"""Tests for core evaluation functionality."""
import pytest
import os
import json
from datetime import datetime
from src.core.evaluation import EvaluationManager, EvaluationRequest


@pytest.fixture
def evaluation_manager(tmp_path):
    """Create evaluation manager with temporary directories."""
    results_dir = tmp_path / "results"
    backup_dir = tmp_path / "backups"
    return EvaluationManager(str(results_dir), str(backup_dir))


def test_evaluation_manager_init(evaluation_manager):
    """Test evaluation manager initialization."""
    assert os.path.exists(evaluation_manager.results_dir)
    assert os.path.exists(evaluation_manager.backup_dir)


def test_backup_results(evaluation_manager):
    """Test backup creation."""
    # Create test results
    eval_id = "test_model_main"
    result_path = os.path.join(evaluation_manager.results_dir, f"{eval_id}.json")
    test_results = {"test": "data"}
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    with open(result_path, 'w') as f:
        json.dump(test_results, f)

    # Create backup
    evaluation_manager.backup_results(eval_id)

    # Check backup exists
    backup_files = os.listdir(evaluation_manager.backup_dir)
    assert len(backup_files) == 1
    assert backup_files[0].startswith(eval_id)
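

# A follow-on sketch: verifies the backup payload matches the original results.
# It assumes backups are plain JSON copies of the results file; if the backup
# format differs (e.g. compressed or wrapped), adjust the final assertion.
def test_backup_preserves_contents(evaluation_manager):
    """Backup file should contain the same JSON payload as the original (sketch)."""
    eval_id = "test_model_main"
    result_path = os.path.join(evaluation_manager.results_dir, f"{eval_id}.json")
    original = {"test": "data"}
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    with open(result_path, 'w') as f:
        json.dump(original, f)

    evaluation_manager.backup_results(eval_id)

    backup_name = os.listdir(evaluation_manager.backup_dir)[0]
    with open(os.path.join(evaluation_manager.backup_dir, backup_name)) as f:
        assert json.load(f) == original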


def test_run_evaluation(evaluation_manager):
    """Test full evaluation run."""
    request = EvaluationRequest(
        model_id="hf-internal-testing/tiny-random-gpt2",
        revision="main",
        precision="float16",
        weight_type="Safetensors",
        submitted_time=datetime.now()
    )
    results = evaluation_manager.run_evaluation(request)

    assert results["model_id"] == request.model_id
    assert results["revision"] == request.revision
    assert "security_score" in results
    assert "safetensors_compliant" in results


def test_evaluation_error_handling(evaluation_manager):
    """Test error handling during evaluation."""
    request = EvaluationRequest(
        model_id="invalid/model",
        revision="main",
        precision="float16",
        weight_type="Safetensors",
        submitted_time=datetime.now()
    )
    with pytest.raises(Exception):
        evaluation_manager.run_evaluation(request)
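

# A follow-on sketch: checks that a failed run leaves no results file behind.
# It assumes run_evaluation fails before (or cleans up after) writing partial
# results; drop this check if partial results are intentionally kept.
def test_failed_evaluation_writes_no_results(evaluation_manager):
    """A failing evaluation should not leave files in the results dir (sketch)."""
    request = EvaluationRequest(
        model_id="invalid/model",
        revision="main",
        precision="float16",
        weight_type="Safetensors",
        submitted_time=datetime.now()
    )
    with pytest.raises(Exception):
        evaluation_manager.run_evaluation(request)

    assert os.listdir(evaluation_manager.results_dir) == []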


def test_concurrent_evaluations(evaluation_manager):
    """Test handling of concurrent evaluations."""
    import threading

    def run_eval(model_id):
        request = EvaluationRequest(
            model_id=model_id,
            revision="main",
            precision="float16",
            weight_type="Safetensors",
            submitted_time=datetime.now()
        )
        try:
            evaluation_manager.run_evaluation(request)
        except Exception:
            pass

    # Start multiple evaluation threads
    threads = []
    for i in range(3):
        thread = threading.Thread(
            target=run_eval,
            args=(f"model_{i}",)
        )
        threads.append(thread)
        thread.start()

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    # Check results directory integrity
    assert os.path.exists(evaluation_manager.results_dir)
    assert os.path.exists(evaluation_manager.backup_dir)
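

# An alternative sketch of the same concurrency check using concurrent.futures,
# which leaves thread management to the standard-library executor. The
# placeholder model ids are assumed to fail fast, as in the test above.
def test_concurrent_evaluations_with_executor(evaluation_manager):
    """Concurrency check via a thread pool instead of bare threads (sketch)."""
    from concurrent.futures import ThreadPoolExecutor

    def run_eval(model_id):
        request = EvaluationRequest(
            model_id=model_id,
            revision="main",
            precision="float16",
            weight_type="Safetensors",
            submitted_time=datetime.now()
        )
        try:
            evaluation_manager.run_evaluation(request)
        except Exception:
            pass  # failures are expected for placeholder model ids

    with ThreadPoolExecutor(max_workers=3) as pool:
        list(pool.map(run_eval, [f"model_{i}" for i in range(3)]))

    assert os.path.exists(evaluation_manager.results_dir)
    assert os.path.exists(evaluation_manager.backup_dir)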