from datetime import datetime, timedelta
from fastapi import APIRouter, HTTPException, Depends
import logging
from lm_eval import evaluator
from svc.schemas import LMHarnessTaskRequest, TaskResponse, LoadModelRequest, DeepEvalSuiteRequest
from fastapi.security import OAuth2PasswordRequestForm
from auth.authentication import get_current_user, create_access_token
from dotenv import load_dotenv
import os
import json
from pathlib import Path
from src.deepeval.deepeval_task_manager import DeepEvalTaskManager
import torch
import gc
from time import time
from huggingface_hub import HfApi, ModelInfo
import threading
router = APIRouter()
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
# Or configure a HfApi client
hf_api = HfApi(
    token=HF_TOKEN,  # Token is not persisted on the machine.
)
@router.post("/token")
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
auth_token = os.getenv("AUTH_UUID")
if auth_token != form_data.password:
raise HTTPException(status_code=400, detail="Incorrect username or password")
access_token = create_access_token(data={"sub": form_data.username})
return {"access_token": access_token, "token_type": "bearer"}
@router.get("/protected")
async def protected_route(username: str = Depends(get_current_user)):
return {"message": f"Hello, {username}! This is a protected resource."}
@router.get("/deepeval/status")
async def deep_eval_status():
#Return running with 200 status code
return {"status": "running"}
@router.get("/deepeval/hardware")
def hardware_status():
info = get_gpu_tier()
print("Hardware Response:", info)
return info
@router.post("/chat", response_model=TaskResponse)
def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):
logger.info(request)
try:
logger.info("Inside")
results = evaluator.simple_evaluate(
model=request.model,
model_args=request.model_args,
tasks=request.tasks,
num_fewshot=request.num_fewshot,
batch_size=request.batch_size,
device=request.device,
limit=request.limit,
write_out=request.write_out # Whether to write out an example document and model input, for checking task integrity
)
except Exception as e:
raise HTTPException(status_code=500, detail=f"lm-harness task execution failed for model: {request.model_args}")
torch.cuda.empty_cache()
results["config"]["model_dtype"] = request.precision
model_name = request.model_args.split(",")[0].split("=")[1]
results["config"]["model_name"] = model_name
results["config"]["model_sha"] = request.model_sha
dumped = json.dumps(results, indent=2)
logger.info("-------------------results------------------\n")
logger.info(dumped)
logger.info("-------------------results end------------------\n")
return TaskResponse(results=dumped)
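

# Example /chat request body (a sketch only; the field names mirror how
# LMHarnessTaskRequest is used above, the values are placeholders):
#
#   {
#       "model": "hf",
#       "model_args": "pretrained=<org>/<model>",
#       "tasks": ["<lm-eval-task>"],
#       "num_fewshot": 0,
#       "batch_size": 8,
#       "device": "cuda:0",
#       "limit": null,
#       "write_out": false,
#       "precision": "float16",
#       "model_sha": "<commit-sha>"
#   }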
@router.post("/deepeval/eval", response_model=TaskResponse)
def deep_eval_suite(request: DeepEvalSuiteRequest):
def run_in_background():
try:
torch.cuda.empty_cache()
#des = DeepEvalTaskManager(request.model_name, request.tasks, request.dtype, request.weight, request.base_model)
des = DeepEvalTaskManager(request.model_name, request.tasks)
start_time = time()
results = des.run_tasks()
end_time = time()
duration = round(end_time - start_time, 2)
model_info: ModelInfo = hf_api.model_info(request.model_name)
config = {
"model_source": "hf",
"num_fewshot": 0,
"batch_size": 8,
"device": "cuda:0",
"model_dtype": "torch.float16",
"model_name": request.model_name,
"model_sha": model_info.sha,
}
final_results = {
"results": results,
"config": config,
"total_evaluation_time_seconds": duration,
"start_time": start_time,
"end_time": end_time
}
# Save and upload
dumped = json.dumps(final_results, indent=2)
path = Path("/tmp", request.model_name, f"results_{datetime.now()}.json")
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(dumped)
RESULTS_REPO = "metunlp/results"
hf_api.upload_file(
path_or_fileobj=path,
path_in_repo=path.relative_to("/tmp").as_posix(),
repo_id=RESULTS_REPO,
repo_type="dataset",
)
logger.info(f"β
Uploaded results to HF Hub for {request.model_name}")
except Exception as e:
logger.exception(f"β Background evaluation failed: {e}")
# π Start evaluation in background
threading.Thread(target=run_in_background, daemon=True).start()
# β
Immediately respond
return TaskResponse(results=json.dumps({"status": "Evaluation started in background"}))
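

# Example /deepeval/eval request body (a sketch only; the handler above reads just
# model_name and tasks from DeepEvalSuiteRequest, the values are placeholders):
#
#   {"model_name": "<org>/<model>", "tasks": ["<deepeval-task>"]}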


def get_gpu_tier():
    if not torch.cuda.is_available():
        return {"gpu": "CPU", "tier": "cpu"}

    device_count = torch.cuda.device_count()
    gpu_names = [torch.cuda.get_device_name(i).lower() for i in range(device_count)]

    # Count how many of each GPU type we care about
    l4_count = sum("l4" in name and "l40s" not in name for name in gpu_names)
    l40s_count = sum("l40s" in name for name in gpu_names)

    if l4_count == device_count:
        return {"gpu": "NVIDIA L4", "tier": f"l4x{l4_count}"}
    elif l40s_count == device_count:
        return {"gpu": "NVIDIA L40S", "tier": f"l40sx{l40s_count}"}
    elif "t4" in gpu_names[0]:
        return {"gpu": "Tesla T4", "tier": "t4-medium"}
    elif "a10g" in gpu_names[0]:
        return {"gpu": "NVIDIA A10G", "tier": "a10g"}
    else:
        return {"gpu": gpu_names[0], "tier": "unknown"}