#!/usr/bin/env python
# -*- coding: utf-8 -*-
# flake8: noqa: E501
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str

# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key, metric_key, title
    task00 = Task("naive_judge", "score", "NaïveJudge")
    task01 = Task("human_eval_solidity_pass_1", "score", "HumanEval for Solidity (pass@1)")
    task02 = Task("human_eval_solidity_pass_3", "score", "HumanEval for Solidity (pass@3)")
    task03 = Task("rouge1", "score", "ROUGE-unigrams")
    task04 = Task("rouge2", "score", "ROUGE-bigrams")
    task05 = Task("rougeL", "score", "ROUGE-Longest Common Subsequence")
    task06 = Task("rougeLsum", "score", "ROUGE-Lsum")
    task07 = Task("bleu", "score", "BLEU")
    task08 = Task("brevity_penalty", "score", "Brevity Penalty")
# ---------------------------------------------------
# Your leaderboard name
TITLE = """<br><img src="file/images/soliditybench.svg" width="500" style="display: block; margin-left: auto; margin-right: auto;">
<h3 align="center" id="space-title">Solidity Leaderboard by IQ</h3>"""
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = ""
# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
# SolidityBench: Evaluating LLM Solidity Code Generation
SolidityBench is the first leaderboard for evaluating and ranking LLMs on Solidity code generation. It was developed by BrainDAO as part of [IQ Code](https://iqcode.ai/), which aims to create a suite of AI models for generating and auditing smart contract code.
We introduce two benchmarks specifically designed for Solidity: NaïveJudge and HumanEval for Solidity.
## Benchmarks
### 1. NaïveJudge
NaïveJudge is a novel approach to smart contract evaluation, integrating a dataset of audited smart contracts from [OpenZeppelin](https://huggingface.co/datasets/braindao/soliditybench-naive-judge-openzeppelin-v1).
#### Evaluation Process:
- LLMs implement smart contracts based on detailed specifications.
- Generated code is compared to audited reference implementations.
- Evaluation is performed by SOTA LLMs (OpenAI GPT-4 and Claude 3.5 Sonnet) acting as impartial code reviewers.
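As a rough illustration of the judging step above, the sketch below shows a minimal LLM-as-judge call; the prompt wording, model name, and helper names are placeholders rather than the exact SolidityBench setup.

```python
# Illustrative sketch only: placeholders, not the exact SolidityBench judge.
from openai import OpenAI

client = OpenAI()

def judge_contract(spec: str, reference: str, candidate: str) -> str:
    system = (
        "You are an impartial Solidity code reviewer. Score the candidate "
        "implementation against the reference for functional completeness (0-60), "
        "best practices and security (0-30), and optimization (0-10)."
    )
    user = "SPECIFICATION: " + spec + " REFERENCE: " + reference + " CANDIDATE: " + candidate
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
    )
    return response.choices[0].message.content
```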
#### Evaluation Criteria:
1. Functional Completeness (0-60 points)
- Implementation of key functionality
- Handling of edge cases
- Appropriate error management
2. Solidity Best Practices and Security (0-30 points)
- Correct and up-to-date Solidity syntax
- Adherence to best practices and design patterns
- Appropriate use of data types and visibility modifiers
- Code structure and maintainability
3. Optimization and Efficiency (0-10 points)
- Gas efficiency
- Avoidance of unnecessary computations
- Storage efficiency
- Overall performance compared to expert implementation
The final score ranges from 0 to 100, calculated by summing the points from each criterion.
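For illustration, the per-criterion points simply add up to the final score; the values below are hypothetical.

```python
# Hypothetical scores for one generated contract.
criteria = {
    "functional_completeness": 52,  # out of 60
    "best_practices_security": 24,  # out of 30
    "optimization_efficiency": 7,   # out of 10
}
final_score = sum(criteria.values())  # 83 on the 0-100 scale
```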
### 2. HumanEval for Solidity
[HumanEval for Solidity](https://huggingface.co/datasets/braindao/humaneval-for-solidity-25) is an adaptation of OpenAI's original HumanEval benchmark, ported from Python to Solidity.
#### Dataset:
- 25 tasks of varying difficulty
- Each task includes corresponding tests designed for use with Hardhat
#### Evaluation Process:
- Custom server built on top of Hardhat compiles and tests the generated Solidity code
- Evaluates the AI model's ability to produce fully functional smart contracts
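The harness itself is not reproduced here; the sketch below shows the general idea of driving a Hardhat project from Python, with the project layout and file names being assumptions.

```python
# Sketch only: assumes a local Hardhat project whose tests exercise contracts/Task.sol.
import pathlib
import subprocess

def compile_and_test(generated_solidity: str, project_dir: str = "hardhat-project") -> bool:
    target = pathlib.Path(project_dir) / "contracts" / "Task.sol"
    target.write_text(generated_solidity)
    compiled = subprocess.run(["npx", "hardhat", "compile"], cwd=project_dir)
    if compiled.returncode != 0:
        return False
    tested = subprocess.run(["npx", "hardhat", "test"], cwd=project_dir)
    return tested.returncode == 0
```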
#### Metrics:
1. pass@1 (Score: 0-100)
- Measures the model's success on the first attempt
- Assesses precision and efficiency
2. pass@3 (Score: 0-100)
- Allows up to three attempts at solving each task
- Provides insights into the model's problem-solving capabilities over multiple tries
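For reference, the standard unbiased pass@k estimator from the original HumanEval paper (Chen et al., 2021) can be computed as shown below; leaderboard scores are reported on a 0-100 scale, and the exact aggregation used here may differ.

```python
# Standard unbiased pass@k estimator; shown for reference.
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    # n = samples generated per task, c = samples that pass all tests
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

print(pass_at_k(n=3, c=1, k=1))  # 0.333... -> 33.3 on the 0-100 scale
print(pass_at_k(n=3, c=1, k=3))  # 1.0 -> 100.0
```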
"""
EVALUATION_REQUESTS_TEXT = """
# ✔️ Check your model
Make sure that you can load your model and tokenizer using `AutoClasses`:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer

revision = "main"  # or the exact branch / commit hash you plan to submit
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely that your model has been improperly uploaded.
"""
EVALUATION_SCRIPT = ''
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = ''