Logging and other improvements
Files changed (6):
- app.py +119 -19
- src/core/evaluation.py +105 -0
- src/core/queue_manager.py +124 -0
- src/logging_config.py +71 -0
- tests/test_evaluation.py +103 -0
- tests/test_queue.py +128 -0
app.py
CHANGED
@@ -3,7 +3,13 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 import pandas as pd
+import os
+import logging
+from datetime import datetime
 
+from src.core.evaluation import EvaluationManager, EvaluationRequest
+from src.core.queue_manager import QueueManager
+from src.logging_config import setup_logging
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
@@ -24,29 +30,72 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import
+from src.envs import (
+    API,
+    CACHE_PATH,
+    EVAL_REQUESTS_PATH,
+    EVAL_RESULTS_PATH,
+    QUEUE_REPO,
+    REPO_ID,
+    RESULTS_REPO,
+    TOKEN
+)
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
 
+# Setup logging
+setup_logging(log_dir="logs")
+logger = logging.getLogger('web')
+
+# Initialize managers
+evaluation_manager = EvaluationManager(
+    results_dir=EVAL_RESULTS_PATH,
+    backup_dir=os.path.join(CACHE_PATH, "eval-backups")
+)
+
+queue_manager = QueueManager(
+    queue_dir=os.path.join(CACHE_PATH, "eval-queue")
+)
+
 def restart_space():
+    """Restart the Hugging Face space."""
+    logger.info("Restarting space")
     API.restart_space(repo_id=REPO_ID)
 
-except Exception:
+def initialize_space():
+    """Initialize the space by downloading required data."""
+    logger.info("Initializing space")
+    try:
+        logger.info(f"Downloading queue data from {QUEUE_REPO}")
+        snapshot_download(
+            repo_id=QUEUE_REPO,
+            local_dir=EVAL_REQUESTS_PATH,
+            repo_type="dataset",
+            tqdm_class=None,
+            etag_timeout=30,
+            token=TOKEN
+        )
+    except Exception as e:
+        logger.error(f"Failed to download queue data: {str(e)}")
+        restart_space()
+
+    try:
+        logger.info(f"Downloading results data from {RESULTS_REPO}")
+        snapshot_download(
+            repo_id=RESULTS_REPO,
+            local_dir=EVAL_RESULTS_PATH,
+            repo_type="dataset",
+            tqdm_class=None,
+            etag_timeout=30,
+            token=TOKEN
+        )
+    except Exception as e:
+        logger.error(f"Failed to download results data: {str(e)}")
+        restart_space()
+
+# Initialize space
+initialize_space()
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -57,12 +106,40 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+def process_evaluation_queue():
+    """Process pending evaluation requests."""
+    logger.info("Processing evaluation queue")
+    while True:
+        request = queue_manager.get_next_request()
+        if not request:
+            break
+
+        try:
+            # Convert queue request to evaluation request
+            eval_request = EvaluationRequest(
+                model_id=request.model_id,
+                revision=request.revision,
+                precision="float16",  # Default precision
+                weight_type="Safetensors",
+                submitted_time=request.timestamp
+            )
+
+            # Run evaluation
+            results = evaluation_manager.run_evaluation(eval_request)
+            logger.info(f"Evaluation complete for {request.model_id}")
+
+            # Mark request as complete
+            queue_manager.mark_complete(request.request_id)
+
+        except Exception as e:
+            logger.error(f"Evaluation failed for {request.model_id}: {str(e)}")
+            # Keep request in active queue for retry
+
 def init_leaderboard(df):
     """Initialize the leaderboard with the given DataFrame."""
     if df is None or df.empty:
-        # Create an empty DataFrame with the required columns
         df = pd.DataFrame(columns=COLS)
+        logger.info("Creating empty leaderboard - no evaluations completed yet")
 
     # Create the leaderboard
     return gr.Dataframe(
@@ -183,8 +260,27 @@ with demo:
 
             submit_button = gr.Button("Submit for Security Evaluation")
             submission_result = gr.Markdown()
+            def handle_submission(model, base_model, revision, precision, weight_type, model_type):
+                """Handle new model submission."""
+                try:
+                    logger.info(f"New submission received for {model}")
+
+                    # Add to queue
+                    request_id = queue_manager.add_request(
+                        model_id=model,
+                        revision=revision if revision else "main"
+                    )
+
+                    # Process queue
+                    process_evaluation_queue()
+
+                    return gr.Markdown("Submission successful! Your model has been added to the evaluation queue.")
+                except Exception as e:
+                    logger.error(f"Submission failed: {str(e)}")
+                    return gr.Markdown(f"Error: {str(e)}")
+
             submit_button.click(
+                handle_submission,
                 [
                     model_name_textbox,
                     base_model_name_textbox,
@@ -206,7 +302,11 @@ with demo:
                 show_copy_button=True,
             )
 
+# Setup schedulers
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.add_job(process_evaluation_queue, "interval", seconds=300)  # Process queue every 5 minutes
 scheduler.start()
+
+logger.info("Application startup complete")
 demo.queue(default_concurrency_limit=40).launch()
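For context, the submit_button.click wiring above follows Gradio's standard event pattern. Below is a minimal, self-contained sketch of that pattern (not part of the commit; the component names and handler body are illustrative placeholders rather than the ones in app.py):

    import gradio as gr

    def handle_submission(model: str, revision: str) -> str:
        # Placeholder handler: app.py's version enqueues the model and then
        # drains the evaluation queue before reporting back to the user.
        return f"Queued {model} @ {revision or 'main'} for evaluation."

    with gr.Blocks() as demo:
        model_box = gr.Textbox(label="Model name")
        revision_box = gr.Textbox(label="Revision", value="main")
        submit = gr.Button("Submit for Security Evaluation")
        result = gr.Markdown()
        # click(fn, inputs, outputs): Gradio reads the input components' values,
        # passes them to fn, and renders the return value into the output component.
        submit.click(handle_submission, [model_box, revision_box], result)

    demo.queue(default_concurrency_limit=40).launch()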
src/core/evaluation.py
ADDED
@@ -0,0 +1,105 @@
"""Core evaluation logic separated from web interface."""
import logging
from typing import Dict, Any, Optional
from dataclasses import dataclass
from datetime import datetime
import json
import os

from src.leaderboard.security_eval import (
    check_safetensors,
    evaluate_secure_coding,
    load_model_and_tokenizer,
)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('evaluation.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

@dataclass
class EvaluationRequest:
    """Data class for evaluation requests."""
    model_id: str
    revision: str
    precision: str
    weight_type: str
    submitted_time: datetime
    base_model: Optional[str] = None

class EvaluationManager:
    """Manages the evaluation pipeline and state."""

    def __init__(self, results_dir: str, backup_dir: str):
        self.results_dir = results_dir
        self.backup_dir = backup_dir
        self._ensure_directories()
        self.logger = logging.getLogger(__name__)

    def _ensure_directories(self) -> None:
        """Ensure required directories exist."""
        os.makedirs(self.results_dir, exist_ok=True)
        os.makedirs(self.backup_dir, exist_ok=True)

    def backup_results(self, eval_id: str) -> None:
        """Create backup of evaluation results."""
        try:
            result_path = os.path.join(self.results_dir, f"{eval_id}.json")
            backup_path = os.path.join(self.backup_dir, f"{eval_id}_{datetime.now().isoformat()}.json")

            if os.path.exists(result_path):
                with open(result_path, 'r') as src, open(backup_path, 'w') as dst:
                    json.dump(json.load(src), dst, indent=2)
                self.logger.info(f"Created backup at {backup_path}")
        except Exception as e:
            self.logger.error(f"Backup failed for {eval_id}: {str(e)}")

    def run_evaluation(self, request: EvaluationRequest) -> Dict[str, Any]:
        """Run evaluation pipeline for a model."""
        try:
            self.logger.info(f"Starting evaluation for {request.model_id}")

            # Run security checks
            safetensors_compliant = check_safetensors(request.model_id, request.revision)
            self.logger.info(f"Safetensors check: {safetensors_compliant}")

            # Load model for evaluation
            model, tokenizer = load_model_and_tokenizer(request.model_id, request.revision)
            self.logger.info("Model loaded successfully")

            # Run security evaluation
            security_score = evaluate_secure_coding(request.model_id, request.revision)
            self.logger.info(f"Security evaluation complete. Score: {security_score}")

            # Compile results
            results = {
                "model_id": request.model_id,
                "revision": request.revision,
                "evaluation_time": datetime.now().isoformat(),
                "safetensors_compliant": safetensors_compliant,
                "security_score": security_score,
                "precision": request.precision,
                "weight_type": request.weight_type,
            }

            # Save and backup results
            eval_id = f"{request.model_id.replace('/', '_')}_{request.revision}"
            result_path = os.path.join(self.results_dir, f"{eval_id}.json")

            with open(result_path, 'w') as f:
                json.dump(results, f, indent=2)

            self.backup_results(eval_id)
            self.logger.info(f"Evaluation complete for {request.model_id}")

            return results

        except Exception as e:
            self.logger.error(f"Evaluation failed for {request.model_id}: {str(e)}")
            raise
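A minimal usage sketch for the new module (not part of the commit; the model id and directory names are placeholders, and run_evaluation needs network access plus the src.leaderboard.security_eval helpers to complete):

    from datetime import datetime
    from src.core.evaluation import EvaluationManager, EvaluationRequest

    manager = EvaluationManager(results_dir="eval-results", backup_dir="eval-backups")

    request = EvaluationRequest(
        model_id="org/model",        # placeholder model id
        revision="main",
        precision="float16",
        weight_type="Safetensors",
        submitted_time=datetime.now(),
    )

    # Runs the safetensors check, loads the model, scores it, then writes
    # eval-results/org_model_main.json plus a timestamped copy under eval-backups/.
    results = manager.run_evaluation(request)
    print(results["security_score"], results["safetensors_compliant"])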
src/core/queue_manager.py
ADDED
@@ -0,0 +1,124 @@
"""Queue management system for model evaluations."""
import logging
from typing import List, Optional
from dataclasses import dataclass
from datetime import datetime
import json
import os
from queue import PriorityQueue
import threading
from threading import Lock

logger = logging.getLogger(__name__)

@dataclass(order=True)
class QueueItem:
    """Priority queue item for evaluations."""
    priority: int
    timestamp: datetime
    request_id: str
    model_id: str
    revision: str

class QueueManager:
    """Manages evaluation request queue with persistence."""

    def __init__(self, queue_dir: str):
        self.queue_dir = queue_dir
        self.queue = PriorityQueue()
        self.active_evaluations: List[str] = []
        self.lock = Lock()
        self._load_persisted_queue()

    def _load_persisted_queue(self) -> None:
        """Load persisted queue items from disk."""
        try:
            queue_file = os.path.join(self.queue_dir, "queue_state.json")
            if os.path.exists(queue_file):
                with open(queue_file, 'r') as f:
                    items = json.load(f)
                    for item in items:
                        self.queue.put(QueueItem(
                            priority=item['priority'],
                            timestamp=datetime.fromisoformat(item['timestamp']),
                            request_id=item['request_id'],
                            model_id=item['model_id'],
                            revision=item['revision']
                        ))
                logger.info(f"Loaded {self.queue.qsize()} items from persisted queue")
        except Exception as e:
            logger.error(f"Failed to load persisted queue: {str(e)}")

    def _persist_queue(self) -> None:
        """Persist current queue state to disk."""
        try:
            # Create a list of all queue items
            items = []
            temp_queue = PriorityQueue()

            while not self.queue.empty():
                item = self.queue.get()
                items.append({
                    'priority': item.priority,
                    'timestamp': item.timestamp.isoformat(),
                    'request_id': item.request_id,
                    'model_id': item.model_id,
                    'revision': item.revision
                })
                temp_queue.put(item)

            # Restore queue
            self.queue = temp_queue

            # Save to disk
            os.makedirs(self.queue_dir, exist_ok=True)
            queue_file = os.path.join(self.queue_dir, "queue_state.json")
            with open(queue_file, 'w') as f:
                json.dump(items, f, indent=2)

            logger.info(f"Persisted {len(items)} items to queue state")
        except Exception as e:
            logger.error(f"Failed to persist queue: {str(e)}")

    def add_request(self, model_id: str, revision: str, priority: int = 1) -> str:
        """Add new evaluation request to queue."""
        with self.lock:
            request_id = f"{model_id.replace('/', '_')}_{revision}_{datetime.now().isoformat()}"
            item = QueueItem(
                priority=priority,
                timestamp=datetime.now(),
                request_id=request_id,
                model_id=model_id,
                revision=revision
            )
            self.queue.put(item)
            self._persist_queue()
            logger.info(f"Added request {request_id} to queue")
            return request_id

    def get_next_request(self) -> Optional[QueueItem]:
        """Get next request from queue."""
        with self.lock:
            if not self.queue.empty():
                item = self.queue.get()
                self.active_evaluations.append(item.request_id)
                self._persist_queue()
                logger.info(f"Retrieved request {item.request_id} from queue")
                return item
            return None

    def mark_complete(self, request_id: str) -> None:
        """Mark evaluation request as complete."""
        with self.lock:
            if request_id in self.active_evaluations:
                self.active_evaluations.remove(request_id)
                logger.info(f"Marked request {request_id} as complete")

    def get_queue_status(self) -> dict:
        """Get current queue status."""
        with self.lock:
            return {
                'queued': self.queue.qsize(),
                'active': len(self.active_evaluations),
                'active_evaluations': self.active_evaluations
            }
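A short usage sketch (not part of the commit; the queue directory is a placeholder) showing the queue semantics: QueueItem is an order=True dataclass whose first field is priority, so the underlying PriorityQueue serves the lowest priority number first, and every mutation is re-persisted to queue_state.json under queue_dir.

    from src.core.queue_manager import QueueManager

    qm = QueueManager(queue_dir="eval-queue")

    qm.add_request("org/model-a", "main", priority=2)
    qm.add_request("org/model-b", "main", priority=1)  # lower number, served first

    item = qm.get_next_request()
    print(item.model_id)             # org/model-b
    print(qm.get_queue_status())     # {'queued': 1, 'active': 1, 'active_evaluations': [...]}

    qm.mark_complete(item.request_id)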
src/logging_config.py
ADDED
@@ -0,0 +1,71 @@
"""Logging configuration for the application."""
import logging
import logging.handlers
import os
from datetime import datetime

def setup_logging(
    log_dir: str = "logs",
    log_level: int = logging.INFO
) -> None:
    """
    Configure application-wide logging.

    Args:
        log_dir: Directory to store log files
        log_level: Logging level to use
    """
    # Create logs directory
    os.makedirs(log_dir, exist_ok=True)

    # Create formatters
    file_formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    console_formatter = logging.Formatter(
        '%(levelname)s: %(message)s'
    )

    # Setup file handler with rotation
    log_file = os.path.join(
        log_dir,
        f"leaderboard_{datetime.now().strftime('%Y%m%d')}.log"
    )
    file_handler = logging.handlers.RotatingFileHandler(
        log_file,
        maxBytes=10485760,  # 10MB
        backupCount=5
    )
    file_handler.setFormatter(file_formatter)

    # Setup console handler
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(console_formatter)

    # Setup root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)
    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)

    # Create separate loggers for different components
    loggers = {
        'evaluation': logging.getLogger('evaluation'),
        'queue': logging.getLogger('queue'),
        'web': logging.getLogger('web'),
        'security': logging.getLogger('security')
    }

    # Configure component loggers
    for name, logger in loggers.items():
        logger.setLevel(log_level)

        # Create component-specific file handler
        component_log = os.path.join(log_dir, f"{name}.log")
        handler = logging.handlers.RotatingFileHandler(
            component_log,
            maxBytes=5242880,  # 5MB
            backupCount=3
        )
        handler.setFormatter(file_formatter)
        logger.addHandler(handler)
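A sketch of how this configuration is meant to be consumed, matching the setup_logging call in app.py (the log message is illustrative): the function attaches a rotating, dated file handler and a console handler to the root logger, and gives each named component its own rotating file under log_dir; component records also propagate up to the root handlers.

    import logging
    from src.logging_config import setup_logging

    setup_logging(log_dir="logs", log_level=logging.INFO)

    # Writes to logs/evaluation.log and, via propagation to the root logger,
    # to logs/leaderboard_YYYYMMDD.log and the console.
    logging.getLogger("evaluation").info("evaluation pipeline started")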
tests/test_evaluation.py
ADDED
@@ -0,0 +1,103 @@
"""Tests for core evaluation functionality."""
import pytest
import os
import json
from datetime import datetime
from src.core.evaluation import EvaluationManager, EvaluationRequest

@pytest.fixture
def evaluation_manager(tmp_path):
    """Create evaluation manager with temporary directories."""
    results_dir = tmp_path / "results"
    backup_dir = tmp_path / "backups"
    return EvaluationManager(str(results_dir), str(backup_dir))

def test_evaluation_manager_init(evaluation_manager):
    """Test evaluation manager initialization."""
    assert os.path.exists(evaluation_manager.results_dir)
    assert os.path.exists(evaluation_manager.backup_dir)

def test_backup_results(evaluation_manager):
    """Test backup creation."""
    # Create test results
    eval_id = "test_model_main"
    result_path = os.path.join(evaluation_manager.results_dir, f"{eval_id}.json")
    test_results = {"test": "data"}

    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    with open(result_path, 'w') as f:
        json.dump(test_results, f)

    # Create backup
    evaluation_manager.backup_results(eval_id)

    # Check backup exists
    backup_files = os.listdir(evaluation_manager.backup_dir)
    assert len(backup_files) == 1
    assert backup_files[0].startswith(eval_id)

def test_run_evaluation(evaluation_manager):
    """Test full evaluation run."""
    request = EvaluationRequest(
        model_id="hf-internal-testing/tiny-random-gpt2",
        revision="main",
        precision="float16",
        weight_type="Safetensors",
        submitted_time=datetime.now()
    )

    results = evaluation_manager.run_evaluation(request)

    assert results["model_id"] == request.model_id
    assert results["revision"] == request.revision
    assert "security_score" in results
    assert "safetensors_compliant" in results

def test_evaluation_error_handling(evaluation_manager):
    """Test error handling during evaluation."""
    request = EvaluationRequest(
        model_id="invalid/model",
        revision="main",
        precision="float16",
        weight_type="Safetensors",
        submitted_time=datetime.now()
    )

    with pytest.raises(Exception):
        evaluation_manager.run_evaluation(request)

def test_concurrent_evaluations(evaluation_manager, tmp_path):
    """Test handling of concurrent evaluations."""
    import threading
    import time

    def run_eval(model_id):
        request = EvaluationRequest(
            model_id=model_id,
            revision="main",
            precision="float16",
            weight_type="Safetensors",
            submitted_time=datetime.now()
        )
        try:
            evaluation_manager.run_evaluation(request)
        except Exception:
            pass

    # Start multiple evaluation threads
    threads = []
    for i in range(3):
        thread = threading.Thread(
            target=run_eval,
            args=(f"model_{i}",)
        )
        threads.append(thread)
        thread.start()

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    # Check results directory integrity
    assert os.path.exists(evaluation_manager.results_dir)
    assert os.path.exists(evaluation_manager.backup_dir)
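Note that test_run_evaluation and test_concurrent_evaluations exercise the real pipeline, so they depend on network access and on the src.leaderboard.security_eval helpers. One way to keep them hermetic, sketched here with pytest's monkeypatch (not part of the commit), is to stub the heavy calls that run_evaluation makes:

    def test_run_evaluation_stubbed(evaluation_manager, monkeypatch):
        """Run the pipeline with the expensive steps stubbed out."""
        monkeypatch.setattr("src.core.evaluation.check_safetensors",
                            lambda model_id, revision: True)
        monkeypatch.setattr("src.core.evaluation.load_model_and_tokenizer",
                            lambda model_id, revision: (object(), object()))
        monkeypatch.setattr("src.core.evaluation.evaluate_secure_coding",
                            lambda model_id, revision: 0.5)

        request = EvaluationRequest(
            model_id="org/model",
            revision="main",
            precision="float16",
            weight_type="Safetensors",
            submitted_time=datetime.now(),
        )
        results = evaluation_manager.run_evaluation(request)
        assert results["security_score"] == 0.5
        assert results["safetensors_compliant"] is True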
tests/test_queue.py
ADDED
@@ -0,0 +1,128 @@
"""Tests for queue management system."""
import pytest
import os
import json
from datetime import datetime
from src.core.queue_manager import QueueManager, QueueItem

@pytest.fixture
def queue_manager(tmp_path):
    """Create queue manager with temporary directory."""
    queue_dir = tmp_path / "queue"
    return QueueManager(str(queue_dir))

def test_queue_manager_init(queue_manager):
    """Test queue manager initialization."""
    assert os.path.exists(queue_manager.queue_dir)
    assert queue_manager.queue.empty()
    assert len(queue_manager.active_evaluations) == 0

def test_add_request(queue_manager):
    """Test adding requests to queue."""
    request_id = queue_manager.add_request("org/model", "main")

    assert not queue_manager.queue.empty()
    assert os.path.exists(os.path.join(queue_manager.queue_dir, "queue_state.json"))

    # Verify persisted state
    with open(os.path.join(queue_manager.queue_dir, "queue_state.json")) as f:
        state = json.load(f)
        assert len(state) == 1
        assert state[0]["model_id"] == "org/model"

def test_get_next_request(queue_manager):
    """Test retrieving requests from queue."""
    added_id = queue_manager.add_request("org/model", "main")
    item = queue_manager.get_next_request()

    assert item is not None
    assert item.model_id == "org/model"
    assert item.revision == "main"
    assert item.request_id in queue_manager.active_evaluations

def test_mark_complete(queue_manager):
    """Test marking requests as complete."""
    added_id = queue_manager.add_request("org/model", "main")
    item = queue_manager.get_next_request()
    queue_manager.mark_complete(item.request_id)

    assert item.request_id not in queue_manager.active_evaluations

def test_queue_status(queue_manager):
    """Test queue status reporting."""
    queue_manager.add_request("org/model1", "main")
    queue_manager.add_request("org/model2", "main")
    item = queue_manager.get_next_request()

    status = queue_manager.get_queue_status()
    assert status["queued"] == 1
    assert status["active"] == 1
    assert item.request_id in status["active_evaluations"]

def test_priority_ordering(queue_manager):
    """Test priority-based queue ordering."""
    # Add requests with different priorities
    queue_manager.add_request("org/model1", "main", priority=2)
    queue_manager.add_request("org/model2", "main", priority=1)  # Higher priority
    queue_manager.add_request("org/model3", "main", priority=3)

    # First request should be model2 (priority 1)
    item = queue_manager.get_next_request()
    assert item.model_id == "org/model2"

    # Second should be model1 (priority 2)
    item = queue_manager.get_next_request()
    assert item.model_id == "org/model1"

    # Third should be model3 (priority 3)
    item = queue_manager.get_next_request()
    assert item.model_id == "org/model3"

def test_queue_persistence(tmp_path):
    """Test queue state persistence across instances."""
    queue_dir = str(tmp_path / "queue")

    # Create first instance and add requests
    manager1 = QueueManager(queue_dir)
    manager1.add_request("org/model1", "main")
    manager1.add_request("org/model2", "main")

    # Create second instance and verify state loaded
    manager2 = QueueManager(queue_dir)
    assert manager2.queue.qsize() == 2

    # Verify requests can be retrieved in correct order
    item1 = manager2.get_next_request()
    assert item1.model_id == "org/model1"

    item2 = manager2.get_next_request()
    assert item2.model_id == "org/model2"

def test_concurrent_access(queue_manager):
    """Test concurrent queue access."""
    import threading
    import time

    def add_and_get():
        # Add a request
        queue_manager.add_request("org/model", "main")
        time.sleep(0.1)  # Simulate some work
        # Try to get a request
        item = queue_manager.get_next_request()
        if item:
            queue_manager.mark_complete(item.request_id)

    # Create multiple threads
    threads = []
    for _ in range(5):
        thread = threading.Thread(target=add_and_get)
        threads.append(thread)
        thread.start()

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    # Verify queue state is consistent
    status = queue_manager.get_queue_status()
    assert len(status["active_evaluations"]) == 0  # All should be marked complete