"""Tests for core evaluation functionality."""
import pytest
import os
import json
from datetime import datetime
from src.core.evaluation import EvaluationManager, EvaluationRequest


@pytest.fixture
def evaluation_manager(tmp_path):
    """Create evaluation manager with temporary directories."""
    results_dir = tmp_path / "results"
    backup_dir = tmp_path / "backups"
    return EvaluationManager(str(results_dir), str(backup_dir))


def test_evaluation_manager_init(evaluation_manager):
    """Test evaluation manager initialization."""
    assert os.path.exists(evaluation_manager.results_dir)
    assert os.path.exists(evaluation_manager.backup_dir)


def test_backup_results(evaluation_manager):
    """Test backup creation."""
    # Create test results
    eval_id = "test_model_main"
    result_path = os.path.join(evaluation_manager.results_dir, f"{eval_id}.json")
    test_results = {"test": "data"}
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    with open(result_path, 'w') as f:
        json.dump(test_results, f)

    # Create backup
    evaluation_manager.backup_results(eval_id)

    # Check backup exists
    backup_files = os.listdir(evaluation_manager.backup_dir)
    assert len(backup_files) == 1
    assert backup_files[0].startswith(eval_id)
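

# A follow-on sketch: verifies the backup payload matches the original results.
# It assumes backups are plain JSON copies of the results file; if the backup
# format differs (e.g. compressed or wrapped), adjust the final assertion.
def test_backup_preserves_contents(evaluation_manager):
    """Backup file should contain the same JSON payload as the original (sketch)."""
    eval_id = "test_model_main"
    result_path = os.path.join(evaluation_manager.results_dir, f"{eval_id}.json")
    original = {"test": "data"}
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    with open(result_path, 'w') as f:
        json.dump(original, f)

    evaluation_manager.backup_results(eval_id)

    backup_name = os.listdir(evaluation_manager.backup_dir)[0]
    with open(os.path.join(evaluation_manager.backup_dir, backup_name)) as f:
        assert json.load(f) == original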


def test_run_evaluation(evaluation_manager):
    """Test full evaluation run."""
    request = EvaluationRequest(
        model_id="hf-internal-testing/tiny-random-gpt2",
        revision="main",
        precision="float16",
        weight_type="Safetensors",
        submitted_time=datetime.now()
    )
    results = evaluation_manager.run_evaluation(request)

    assert results["model_id"] == request.model_id
    assert results["revision"] == request.revision
    assert "security_score" in results
    assert "safetensors_compliant" in results


def test_evaluation_error_handling(evaluation_manager):
    """Test error handling during evaluation."""
    request = EvaluationRequest(
        model_id="invalid/model",
        revision="main",
        precision="float16",
        weight_type="Safetensors",
        submitted_time=datetime.now()
    )
    with pytest.raises(Exception):
        evaluation_manager.run_evaluation(request)
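

# A follow-on sketch: checks that a failed run leaves no results file behind.
# It assumes run_evaluation fails before (or cleans up after) writing partial
# results; drop this check if partial results are intentionally kept.
def test_failed_evaluation_writes_no_results(evaluation_manager):
    """A failing evaluation should not leave files in the results dir (sketch)."""
    request = EvaluationRequest(
        model_id="invalid/model",
        revision="main",
        precision="float16",
        weight_type="Safetensors",
        submitted_time=datetime.now()
    )
    with pytest.raises(Exception):
        evaluation_manager.run_evaluation(request)

    assert os.listdir(evaluation_manager.results_dir) == []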


def test_concurrent_evaluations(evaluation_manager):
    """Test handling of concurrent evaluations."""
    import threading

    def run_eval(model_id):
        request = EvaluationRequest(
            model_id=model_id,
            revision="main",
            precision="float16",
            weight_type="Safetensors",
            submitted_time=datetime.now()
        )
        try:
            evaluation_manager.run_evaluation(request)
        except Exception:
            pass

    # Start multiple evaluation threads
    threads = []
    for i in range(3):
        thread = threading.Thread(
            target=run_eval,
            args=(f"model_{i}",)
        )
        threads.append(thread)
        thread.start()

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    # Check results directory integrity
    assert os.path.exists(evaluation_manager.results_dir)
    assert os.path.exists(evaluation_manager.backup_dir)
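

# An alternative sketch of the same concurrency check using concurrent.futures,
# which leaves thread management to the standard-library executor. The
# placeholder model ids are assumed to fail fast, as in the test above.
def test_concurrent_evaluations_with_executor(evaluation_manager):
    """Concurrency check via a thread pool instead of bare threads (sketch)."""
    from concurrent.futures import ThreadPoolExecutor

    def run_eval(model_id):
        request = EvaluationRequest(
            model_id=model_id,
            revision="main",
            precision="float16",
            weight_type="Safetensors",
            submitted_time=datetime.now()
        )
        try:
            evaluation_manager.run_evaluation(request)
        except Exception:
            pass  # failures are expected for placeholder model ids

    with ThreadPoolExecutor(max_workers=3) as pool:
        list(pool.map(run_eval, [f"model_{i}" for i in range(3)]))

    assert os.path.exists(evaluation_manager.results_dir)
    assert os.path.exists(evaluation_manager.backup_dir)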