import datetime
import os
import re

import gradio as gr
import pandas as pd
import yaml

# Constants
EVAL_CARDS_DIR = "eval_cards"
TEMPLATE_PATH = "template.yaml"

# Ensure the eval cards directory exists
os.makedirs(EVAL_CARDS_DIR, exist_ok=True)

# Copy the bundled template to the location the app serves it from
with open("yaml_template.yaml", "r") as template_file:
    with open(TEMPLATE_PATH, "w") as f:
        f.write(template_file.read())


def load_template():
    """Load the YAML template"""
    with open(TEMPLATE_PATH, "r") as file:
        return file.read()


def yaml_to_dict(yaml_str):
    """Convert a YAML string to a Python dictionary"""
    try:
        return yaml.safe_load(yaml_str)
    except yaml.YAMLError as e:
        return {"error": str(e)}


def compute_coverage_score(eval_data):
    """
    Compute a coverage score for the eval card.

    Returns a score from 0-100 and a breakdown of coverage by section.
    """
    sections = {
        "metadata": 5,
        "evaluation_design": 10,
        "estimand": 20,
        "estimator": 20,
        "estimate": 20,
        "results_communication": 10,
        "known_issues_and_limitations": 10,
        "version_and_maintenance": 5,
        "citation_and_usage": 5,
    }

    scores = {}
    total_score = 0

    def count_filled_fields(data, prefix=""):
        # Recursively count (filled, total) leaf fields in nested dicts/lists.
        if isinstance(data, dict):
            filled = 0
            total = 0
            for key, value in data.items():
                if isinstance(value, (dict, list)):
                    sub_filled, sub_total = count_filled_fields(
                        value, f"{prefix}.{key}" if prefix else key
                    )
                    filled += sub_filled
                    total += sub_total
                else:
                    total += 1
                    if value and not (
                        isinstance(value, str) and value.strip() in ["", "[]", "{}"]
                    ):
                        filled += 1
            return filled, total
        elif isinstance(data, list):
            if not data:
                return 0, 1
            filled = 0
            total = 0
            for item in data:
                sub_filled, sub_total = count_filled_fields(item)
                filled += sub_filled
                total += sub_total
            return filled, total
        else:
            return 1 if data else 0, 1

    # Compute scores for each section
    for section, weight in sections.items():
        if section in eval_data:
            filled, total = count_filled_fields(eval_data[section])
            completion_rate = filled / total if total > 0 else 0
            scores[section] = {
                "score": round(completion_rate * weight, 2),
                "max_score": weight,
                "completion_rate": round(completion_rate * 100, 2),
                "fields_filled": filled,
                "fields_total": total,
            }
            total_score += scores[section]["score"]
        else:
            scores[section] = {
                "score": 0,
                "max_score": weight,
                "completion_rate": 0,
                "fields_filled": 0,
                "fields_total": 0,
            }

    return min(round(total_score, 2), 100), scores
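
# Illustrative only: a minimal sketch of how compute_coverage_score behaves on a
# tiny, hypothetical eval card. The leaf field names inside the sections are
# assumptions for this example, not the real template schema; only the
# top-level section names in `sections` above are authoritative. This helper is
# never called by the app.
def _example_coverage_score():
    sample = {
        "metadata": {"authors": ["A. Author"], "creation_date": "2025-01-01"},
        "estimand": {"target_construct": "", "scope": "materials property QA"},
    }
    score, details = compute_coverage_score(sample)
    # "metadata" has 2 of 2 leaf fields filled -> full 5 points;
    # "estimand" has 1 of 2 filled -> 10 of 20 points; absent sections score 0.
    print(score)  # 15.0
    print(details["estimand"]["completion_rate"])  # 50.0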
""" payload = { "model": "llama-3.3-70b-versatile", # or another groq supported model "messages": [{"role": "user", "content": prompt}], } response = requests.post( "https://api.groq.com/openai/v1/chat/completions", headers=headers, json=payload, ) if response.status_code == 200: return response.json()["choices"][0]["message"]["content"] else: return f"Error getting Groq LLM feedback: {response.status_code} - {response.text}" except Exception as e: return f"Error getting Groq LLM feedback: {str(e)}" def save_eval_card(yaml_content, paper_url="", repo_url=""): """Save an eval card with additional metadata""" try: eval_data = yaml.safe_load(yaml_content) # Add paper and repository links to metadata if paper_url: eval_data["metadata"]["paper_link"] = paper_url if repo_url: eval_data["metadata"]["repository_link"] = repo_url # Update the YAML content with the new metadata yaml_content = yaml.dump(eval_data) filename = re.sub(r"[^\w\-_]", "_", eval_data.get("title", "Unnamed")) filename = ( f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml" ) file_path = os.path.join(EVAL_CARDS_DIR, filename) with open(file_path, "w") as file: file.write(yaml_content) return f"Evaluation card saved successfully as {filename}", file_path except Exception as e: return f"Error saving evaluation card: {str(e)}", None def load_all_eval_cards(): """Load all eval cards from the repository""" eval_cards = [] for filename in os.listdir(EVAL_CARDS_DIR): if filename.endswith(".yaml"): file_path = os.path.join(EVAL_CARDS_DIR, filename) try: with open(file_path, "r") as file: yaml_content = file.read() eval_data = yaml.safe_load(yaml_content) # Compute coverage score score, score_details = compute_coverage_score(eval_data) score = min(score, 100) # Extract key metadata eval_cards.append( { "filename": filename, "title": eval_data.get("title", "Unnamed Evaluation"), "summary": eval_data.get("summary", ""), "authors": ", ".join( eval_data.get("metadata", {}).get("authors", []) ), "creation_date": eval_data.get("metadata", {}).get( "creation_date", "" ), "coverage_score": score, "score_details": score_details, "yaml_content": yaml_content, "data": eval_data, } ) except Exception as e: print(f"Error loading {filename}: {str(e)}") return eval_cards def format_eval_card_as_html(eval_card): """Format an eval card as HTML for display""" html = f"""
{eval_card["summary"]}
Authors: {eval_card["authors"]}
Created: {eval_card["creation_date"]}
{f'Repository: {eval_card["data"]["metadata"].get("repository_link", "")}
' if eval_card["data"]["metadata"].get("repository_link") else ""} {f'Paper: {eval_card["data"]["metadata"].get("paper_link", "")}
' if eval_card["data"]["metadata"].get("paper_link") else ""}Coverage Score: {eval_card["coverage_score"]}%
Section | Score | Completion |
---|---|---|
{section} | {details["score"]}/{details["max_score"]} | {details["completion_rate"]}% |
No evaluation cards found.
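
# Illustrative only (never called by the app): the minimal card dictionary that
# format_eval_card_as_html expects -- the same shape load_all_eval_cards
# produces. All values here are hypothetical.
def _example_card_html():
    card = {
        "title": "Demo Eval",
        "summary": "A hypothetical benchmark summary.",
        "authors": "A. Author",
        "creation_date": "2025-01-01",
        "coverage_score": 15.0,
        "score_details": {
            "metadata": {"score": 5, "max_score": 5, "completion_rate": 100.0},
        },
        "data": {"metadata": {}},
    }
    return format_eval_card_as_html(card)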
" # Sort by coverage score (highest first) eval_cards.sort(key=lambda x: x["coverage_score"], reverse=True) html = "" for eval_card in eval_cards: html += format_eval_card_as_html(eval_card) return html def upload_file(file): """Process an uploaded YAML file""" if file is None: return "No file uploaded", None try: yaml_content = file.decode("utf-8") # Validate YAML eval_data = yaml.safe_load(yaml_content) return yaml_content, eval_data except Exception as e: return f"Error processing file: {str(e)}", None def get_feedback(yaml_content): """Get LLM feedback on the eval card""" if not yaml_content: return "Please upload or paste a YAML file first." # Use provided token or get from environment api_token = os.environ.get("GROQ_API_KEY") if not api_token: return ( "Please provide an API token or set the GROQ_API_KEY environment variable." ) feedback = get_llm_feedback(yaml_content, api_token) return feedback def submit_eval_card(yaml_content, paper_url="", repo_url=""): """Submit an eval card to the repository""" if not yaml_content: return "Please upload or paste a YAML file first.", None, None try: # Validate YAML eval_data = yaml.safe_load(yaml_content) # Compute coverage score score, score_details = compute_coverage_score(eval_data) # Save eval card with URLs result, file_path = save_eval_card(yaml_content, paper_url, repo_url) if file_path: return ( f"Evaluation card saved successfully! Coverage score: {score}%", score, score_details, ) else: return f"Error saving evaluation card: {result}", None, None except Exception as e: return f"Error processing evaluation card: {str(e)}", None, None def refresh_gallery(): """Refresh the gallery of eval cards""" eval_cards = load_all_eval_cards() html = create_eval_cards_table(eval_cards) # Convert data to pandas DataFrame for table view table_data = [] for card in eval_cards: author_counts = {} for card in eval_cards: authors = card["authors"].split(", ") for author in authors: if author in author_counts: author_counts[author] += 1 else: author_counts[author] = 1 top_authors = sorted(author_counts.items(), key=lambda x: x[1], reverse=True)[:20] top_authors = [author for author, count in top_authors] for card in eval_cards: authors = card["authors"].split(", ") filtered_authors = [author for author in authors if author in top_authors] table_data.append( { "Title": card["title"], "Authors": ", ".join(filtered_authors), "Creation Date": card["creation_date"], "Coverage Score": f"{card['coverage_score']}%", } ) df = pd.DataFrame(table_data) return html, df if not df.empty else None def handle_upload_tab(file_obj, yaml_text): """Handle upload tab actions - either use uploaded file or pasted text""" if file_obj is not None: yaml_content, eval_data = upload_file(file_obj) return yaml_content else: return yaml_text # Create the Gradio interface with gr.Blocks(title="Evaluation Cards Gallery") as app: with gr.Row(): with gr.Column(scale=2): gr.Markdown("# Evaluation Cards for Machine Learning in Materials Science. ") gr.Markdown(""" Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the gallery. checkout the [GitHub repository](https://github.com/lamalab-org/eval-cards) for more information. 
""") with gr.Tabs(): with gr.TabItem("Upload & Review"): with gr.Row(): with gr.Column(): file_upload = gr.File( label="Upload YAML File", file_types=[".yaml", ".yml"] ) with gr.Accordion("Or paste YAML content", open=False): yaml_input = gr.TextArea( label="YAML Content", placeholder="Paste your YAML content here...", lines=10, ) paper_url_input = gr.Textbox( label="Paper URL (Optional)", placeholder="https://arxiv.org/abs/...", ) repo_url_input = gr.Textbox( label="Repository URL (Optional)", placeholder="https://github.com/...", ) load_template_btn = gr.Button("Load Template") # api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password") with gr.Row(): get_feedback_btn = gr.Button("Get LLM Feedback") submit_btn = gr.Button( "Submit Evaluation Card", variant="primary" ) with gr.Column(): yaml_display = gr.TextArea(label="Current YAML", lines=20) with gr.Accordion("LLM Feedback", open=True): feedback_display = gr.Markdown() with gr.Accordion("Submission Result", open=True): result_display = gr.Markdown() coverage_score = gr.Number( label="Coverage Score", visible=False ) coverage_details = gr.JSON( label="Coverage Details", visible=False ) with gr.TabItem("Gallery"): refresh_btn = gr.Button("Refresh Gallery") with gr.Tabs(): with gr.TabItem("Card View"): gallery_html = gr.HTML() with gr.TabItem("Table View"): gallery_table = gr.DataFrame() # Set up event handlers load_template_btn.click(fn=load_template, outputs=[yaml_display]) file_upload.change( fn=handle_upload_tab, inputs=[file_upload, yaml_input], outputs=[yaml_display] ) yaml_input.change(fn=lambda x: x, inputs=[yaml_input], outputs=[yaml_display]) get_feedback_btn.click( fn=get_feedback, inputs=[yaml_display], outputs=[feedback_display] ) submit_btn.click( fn=submit_eval_card, inputs=[yaml_display, paper_url_input, repo_url_input], outputs=[result_display, coverage_score, coverage_details], ) refresh_btn.click(fn=refresh_gallery, outputs=[gallery_html, gallery_table]) # Initialize the gallery on app start app.load(fn=refresh_gallery, outputs=[gallery_html, gallery_table]) # Launch the app if __name__ == "__main__": app.launch()