from datetime import datetime
import json
import logging
import os
import threading
from pathlib import Path
from time import time

import torch
from dotenv import load_dotenv
from fastapi import APIRouter, Depends, HTTPException
from fastapi.security import OAuth2PasswordRequestForm
from huggingface_hub import HfApi, ModelInfo
from lm_eval import evaluator

from auth.authentication import create_access_token, get_current_user
from src.deepeval.deepeval_task_manager import DeepEvalTaskManager
from svc.schemas import DeepEvalSuiteRequest, LMHarnessTaskRequest, TaskResponse

router = APIRouter()

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

hf_api = HfApi(
    token=HF_TOKEN,  # Token is not persisted on the machine.
)


@router.post("/token")
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
    auth_token = os.getenv("AUTH_UUID")
    if auth_token != form_data.password:
        raise HTTPException(status_code=400, detail="Incorrect username or password")
    access_token = create_access_token(data={"sub": form_data.username})
    return {"access_token": access_token, "token_type": "bearer"}


@router.get("/protected")
async def protected_route(username: str = Depends(get_current_user)):
    return {"message": f"Hello, {username}! This is a protected resource."}


@router.get("/deepeval/status")
async def deep_eval_status():
    # Return "running" with a 200 status code.
    return {"status": "running"}


@router.get("/deepeval/hardware")
def hardware_status():
    info = get_gpu_tier()
    logger.info("Hardware response: %s", info)
    return info


@router.post("/chat", response_model=TaskResponse)
def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):
    logger.info(request)
    try:
        results = evaluator.simple_evaluate(
            model=request.model,
            model_args=request.model_args,
            tasks=request.tasks,
            num_fewshot=request.num_fewshot,
            batch_size=request.batch_size,
            device=request.device,
            limit=request.limit,
            # Whether to write out an example document and model input,
            # for checking task integrity.
            write_out=request.write_out,
        )
    except Exception as e:
        logger.exception("lm-harness task execution failed")
        raise HTTPException(
            status_code=500,
            detail=f"lm-harness task execution failed for model: {request.model_args}",
        ) from e

    torch.cuda.empty_cache()

    results["config"]["model_dtype"] = request.precision
    # model_args is a comma-separated key=value string whose first entry is
    # expected to be "pretrained=<model_name>".
    model_name = request.model_args.split(",")[0].split("=")[1]
    results["config"]["model_name"] = model_name
    results["config"]["model_sha"] = request.model_sha

    dumped = json.dumps(results, indent=2)
    logger.info("-------------------results------------------\n")
    logger.info(dumped)
    logger.info("-------------------results end------------------\n")

    return TaskResponse(results=dumped)


@router.post("/deepeval/eval", response_model=TaskResponse)
def deep_eval_suite(request: DeepEvalSuiteRequest):
    def run_in_background():
        try:
            torch.cuda.empty_cache()
            des = DeepEvalTaskManager(request.model_name, request.tasks)

            start_time = time()
            results = des.run_tasks()
            end_time = time()
            duration = round(end_time - start_time, 2)

            model_info: ModelInfo = hf_api.model_info(request.model_name)
            config = {
                "model_source": "hf",
                "num_fewshot": 0,
                "batch_size": 8,
                "device": "cuda:0",
                "model_dtype": "torch.float16",
                "model_name": request.model_name,
                "model_sha": model_info.sha,
            }
            final_results = {
                "results": results,
                "config": config,
                "total_evaluation_time_seconds": duration,
                "start_time": start_time,
                "end_time": end_time,
            }

            # Save and upload. The timestamp is formatted explicitly because
            # str(datetime.now()) would embed spaces and colons in the filename.
            dumped = json.dumps(final_results, indent=2)
            timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
            path = Path("/tmp", request.model_name, f"results_{timestamp}.json")
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_text(dumped)

            RESULTS_REPO = "metunlp/results"
            hf_api.upload_file(
                path_or_fileobj=path,
                path_in_repo=path.relative_to("/tmp").as_posix(),
                repo_id=RESULTS_REPO,
                repo_type="dataset",
            )
            logger.info(f"✅ Uploaded results to HF Hub for {request.model_name}")
        except Exception as e:
            logger.exception(f"❌ Background evaluation failed: {e}")

    # 🔁 Start the evaluation in a background thread.
    threading.Thread(target=run_in_background, daemon=True).start()

    # ✅ Respond immediately; results are uploaded to the Hub when the run finishes.
    return TaskResponse(results=json.dumps({"status": "Evaluation started in background"}))


def get_gpu_tier():
    if not torch.cuda.is_available():
        return {"gpu": "CPU", "tier": "cpu"}

    device_count = torch.cuda.device_count()
    gpu_names = [torch.cuda.get_device_name(i).lower() for i in range(device_count)]

    # Count how many of each GPU type we care about.
    l4_count = sum("l4" in name and "l40s" not in name for name in gpu_names)
    l40s_count = sum("l40s" in name for name in gpu_names)

    if l4_count == device_count:
        return {"gpu": "NVIDIA L4", "tier": f"l4x{l4_count}"}
    elif l40s_count == device_count:
        return {"gpu": "NVIDIA L40S", "tier": f"l40sx{l40s_count}"}
    elif "t4" in gpu_names[0]:
        return {"gpu": "Tesla T4", "tier": "t4-medium"}
    elif "a10g" in gpu_names[0]:
        return {"gpu": "NVIDIA A10G", "tier": "a10g"}
    else:
        return {"gpu": gpu_names[0], "tier": "unknown"}
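

# --- Example client usage ---
# A minimal sketch of how a client might drive this service, assuming it is
# served at http://localhost:8000 and that the server's AUTH_UUID matches the
# password below. The `requests` dependency, the host/port, and the example
# model/task names are illustrative assumptions, not part of the service.
#
#   import requests
#
#   BASE_URL = "http://localhost:8000"  # assumed host/port
#
#   # /token expects form-encoded credentials (OAuth2PasswordRequestForm);
#   # the returned bearer token is required by protected routes such as /chat.
#   token = requests.post(
#       f"{BASE_URL}/token",
#       data={"username": "admin", "password": "<AUTH_UUID>"},
#   ).json()["access_token"]
#
#   # /deepeval/eval returns immediately; the evaluation continues in a
#   # background thread and results are uploaded to the Hub when it finishes.
#   resp = requests.post(
#       f"{BASE_URL}/deepeval/eval",
#       json={"model_name": "org/model", "tasks": ["task_a"]},  # assumed field values
#   )
#   print(resp.json())  # {"results": "{\"status\": \"Evaluation started in background\"}"}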