Upload from GitHub Actions: Try moving `cache` calls that cause CI issues
Browse files
- evals/models.py +5 -4
- evals/tasks.py +6 -11
evals/models.py
CHANGED
@@ -93,9 +93,10 @@ def get_current_popular_models(date: date):
|
|
93 |
return [get_model(model["model_permaslug"]) for model in data]
|
94 |
|
95 |
|
96 |
-
popular_models =
|
97 |
-
date.today()
|
98 |
-
|
|
|
99 |
popular_models = [get_model(m) for m in popular_models if get_model(m)]
|
100 |
popular_models = [
|
101 |
m for m in popular_models if m["endpoint"] and not m["endpoint"]["is_free"]
|
@@ -104,7 +105,7 @@ popular_models = [m["slug"] for m in popular_models]
|
|
104 |
popular_models = [
|
105 |
m for m in popular_models if m and m not in models and m not in blocklist
|
106 |
]
|
107 |
-
models += popular_models
|
108 |
|
109 |
load_dotenv()
|
110 |
client = AsyncOpenAI(
|
|
|
93 |
return [get_model(model["model_permaslug"]) for model in data]
|
94 |
|
95 |
|
96 |
+
popular_models = (
|
97 |
+
get_historical_popular_models(date.today())[:5]
|
98 |
+
+ get_current_popular_models(date.today())[:5]
|
99 |
+
)
|
100 |
popular_models = [get_model(m) for m in popular_models if get_model(m)]
|
101 |
popular_models = [
|
102 |
m for m in popular_models if m["endpoint"] and not m["endpoint"]["is_free"]
|
|
|
105 |
popular_models = [
|
106 |
m for m in popular_models if m and m not in models and m not in blocklist
|
107 |
]
|
108 |
+
models += popular_models
|
109 |
|
110 |
load_dotenv()
|
111 |
client = AsyncOpenAI(
|
evals/tasks.py
CHANGED
@@ -24,7 +24,6 @@ target_languages = languages[languages["in_benchmark"]].sample(
|
|
24 |
)
|
25 |
|
26 |
|
27 |
-
@cache
|
28 |
async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
29 |
original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
30 |
target_language = target_languages.iloc[sentence_nr]
|
@@ -78,7 +77,6 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
|
78 |
# metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
|
79 |
|
80 |
|
81 |
-
@cache
|
82 |
async def classify_and_evaluate(model, bcp_47, nr):
|
83 |
language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
84 |
sentences = flores_sentences(language)
|
@@ -161,7 +159,6 @@ def corrupt_sentence(sentence):
|
|
161 |
return sentence[:start] + "<mask>" + sentence[end:]
|
162 |
|
163 |
|
164 |
-
@cache
|
165 |
async def mlm_and_evaluate(model, language_bcp_47, nr):
|
166 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
167 |
sentences = flores_sentences(language)
|
@@ -206,7 +203,6 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
|
|
206 |
]
|
207 |
|
208 |
|
209 |
-
@cache
|
210 |
async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
211 |
ds_name, examples, task = load_mmlu(language_bcp_47, nr)
|
212 |
if not task:
|
@@ -254,7 +250,6 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
|
254 |
]
|
255 |
|
256 |
|
257 |
-
@cache
|
258 |
async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
259 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
260 |
fleurs = pd.read_csv(
|
@@ -287,10 +282,10 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
|
287 |
|
288 |
|
289 |
tasks = {
|
290 |
-
"translation_from": partial(translate_and_evaluate, mode="from"),
|
291 |
-
"translation_to": partial(translate_and_evaluate, mode="to"),
|
292 |
-
# "classification": classify_and_evaluate,
|
293 |
-
# "mlm": mlm_and_evaluate,
|
294 |
-
"mmlu": mmlu_and_evaluate,
|
295 |
-
# "asr": transcribe_and_evaluate,
|
296 |
}
|
|
|
24 |
)
|
25 |
|
26 |
|
|
|
27 |
async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
28 |
original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
29 |
target_language = target_languages.iloc[sentence_nr]
|
|
|
77 |
# metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
|
78 |
|
79 |
|
|
|
80 |
async def classify_and_evaluate(model, bcp_47, nr):
|
81 |
language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
82 |
sentences = flores_sentences(language)
|
|
|
159 |
return sentence[:start] + "<mask>" + sentence[end:]
|
160 |
|
161 |
|
|
|
162 |
async def mlm_and_evaluate(model, language_bcp_47, nr):
|
163 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
164 |
sentences = flores_sentences(language)
|
|
|
203 |
]
|
204 |
|
205 |
|
|
|
206 |
async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
207 |
ds_name, examples, task = load_mmlu(language_bcp_47, nr)
|
208 |
if not task:
|
|
|
250 |
]
|
251 |
|
252 |
|
|
|
253 |
async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
254 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
255 |
fleurs = pd.read_csv(
|
|
|
282 |
|
283 |
|
284 |
tasks = {
|
285 |
+
"translation_from": cache(partial(translate_and_evaluate, mode="from")),
|
286 |
+
"translation_to": cache(partial(translate_and_evaluate, mode="to")),
|
287 |
+
# "classification": cache(classify_and_evaluate),
|
288 |
+
# "mlm": cache(mlm_and_evaluate),
|
289 |
+
"mmlu": cache(mmlu_and_evaluate),
|
290 |
+
# "asr": cache(transcribe_and_evaluate),
|
291 |
}
|