Upload from GitHub Actions: Merge pull request #13 from datenlabor-bmz/jn-dev
Browse files
- .DS_Store +0 -0
- evals/models.py +4 -2
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
evals/models.py
CHANGED
@@ -23,6 +23,7 @@ important_models = [
|
|
23 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
24 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
25 |
# "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
|
|
|
26 |
"openai/gpt-4.1", # 8$
|
27 |
"openai/gpt-4.1-mini", # 1.6$
|
28 |
"openai/gpt-4.1-nano", # 0.4$
|
@@ -54,6 +55,7 @@ important_models = [
|
|
54 |
|
55 |
blocklist = [
|
56 |
"google/gemini-2.5-pro-preview",
|
|
|
57 |
"google/gemini-2.5-flash-preview",
|
58 |
"google/gemini-2.5-flash-lite-preview",
|
59 |
"google/gemini-2.5-flash-preview-04-17",
|
@@ -236,7 +238,7 @@ async def complete(**kwargs) -> str | None:
|
|
236 |
return None
|
237 |
raise e
|
238 |
except asyncio.TimeoutError:
|
239 |
-
print(f"⏰ Timeout after {timeout}s for model {
|
240 |
return None
|
241 |
if not response.choices:
|
242 |
raise Exception(response)
|
@@ -407,7 +409,7 @@ def load_models(date: date):
|
|
407 |
creation_date=creation_date_hf.combine_first(creation_date_or),
|
408 |
)
|
409 |
# Filter out expensive models to keep costs reasonable
|
410 |
-
models = models[models["cost"] <=
|
411 |
models["tasks"] = [
|
412 |
["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
|
413 |
] * len(models)
|
|
|
23 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
24 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
25 |
# "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
|
26 |
+
"openai/gpt-5", # include if/when available
|
27 |
"openai/gpt-4.1", # 8$
|
28 |
"openai/gpt-4.1-mini", # 1.6$
|
29 |
"openai/gpt-4.1-nano", # 0.4$
|
|
|
55 |
|
56 |
blocklist = [
|
57 |
"google/gemini-2.5-pro-preview",
|
58 |
+
"google/gemini-2.5-pro",
|
59 |
"google/gemini-2.5-flash-preview",
|
60 |
"google/gemini-2.5-flash-lite-preview",
|
61 |
"google/gemini-2.5-flash-preview-04-17",
|
|
|
238 |
return None
|
239 |
raise e
|
240 |
except asyncio.TimeoutError:
|
241 |
+
print(f"⏰ Timeout after {timeout}s for model {model_id}")
|
242 |
return None
|
243 |
if not response.choices:
|
244 |
raise Exception(response)
|
|
|
409 |
creation_date=creation_date_hf.combine_first(creation_date_or),
|
410 |
)
|
411 |
# Filter out expensive models to keep costs reasonable
|
412 |
+
models = models[models["cost"] <= 15.0].reset_index(drop=True)
|
413 |
models["tasks"] = [
|
414 |
["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
|
415 |
] * len(models)
|