lukehinds committed on
Commit dbdbe46 · 1 Parent(s): 5403e9d

Logging and other improvements

app.py CHANGED
@@ -3,7 +3,13 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 import pandas as pd
+import os
+import logging
+from datetime import datetime
 
+from src.core.evaluation import EvaluationManager, EvaluationRequest
+from src.core.queue_manager import QueueManager
+from src.logging_config import setup_logging
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
@@ -24,29 +30,72 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.envs import (
+    API,
+    CACHE_PATH,
+    EVAL_REQUESTS_PATH,
+    EVAL_RESULTS_PATH,
+    QUEUE_REPO,
+    REPO_ID,
+    RESULTS_REPO,
+    TOKEN
+)
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
 
+# Setup logging
+setup_logging(log_dir="logs")
+logger = logging.getLogger('web')
+
+# Initialize managers
+evaluation_manager = EvaluationManager(
+    results_dir=EVAL_RESULTS_PATH,
+    backup_dir=os.path.join(CACHE_PATH, "eval-backups")
+)
+
+queue_manager = QueueManager(
+    queue_dir=os.path.join(CACHE_PATH, "eval-queue")
+)
+
 def restart_space():
+    """Restart the Hugging Face space."""
+    logger.info("Restarting space")
     API.restart_space(repo_id=REPO_ID)
 
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+def initialize_space():
+    """Initialize the space by downloading required data."""
+    logger.info("Initializing space")
+    try:
+        logger.info(f"Downloading queue data from {QUEUE_REPO}")
+        snapshot_download(
+            repo_id=QUEUE_REPO,
+            local_dir=EVAL_REQUESTS_PATH,
+            repo_type="dataset",
+            tqdm_class=None,
+            etag_timeout=30,
+            token=TOKEN
+        )
+    except Exception as e:
+        logger.error(f"Failed to download queue data: {str(e)}")
+        restart_space()
+
+    try:
+        logger.info(f"Downloading results data from {RESULTS_REPO}")
+        snapshot_download(
+            repo_id=RESULTS_REPO,
+            local_dir=EVAL_RESULTS_PATH,
+            repo_type="dataset",
+            tqdm_class=None,
+            etag_timeout=30,
+            token=TOKEN
+        )
+    except Exception as e:
+        logger.error(f"Failed to download results data: {str(e)}")
+        restart_space()
+
+# Initialize space
+initialize_space()
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -57,12 +106,40 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+def process_evaluation_queue():
+    """Process pending evaluation requests."""
+    logger.info("Processing evaluation queue")
+    while True:
+        request = queue_manager.get_next_request()
+        if not request:
+            break
+
+        try:
+            # Convert queue request to evaluation request
+            eval_request = EvaluationRequest(
+                model_id=request.model_id,
+                revision=request.revision,
+                precision="float16",  # Default precision
+                weight_type="Safetensors",
+                submitted_time=request.timestamp
+            )
+
+            # Run evaluation
+            results = evaluation_manager.run_evaluation(eval_request)
+            logger.info(f"Evaluation complete for {request.model_id}")
+
+            # Mark request as complete
+            queue_manager.mark_complete(request.request_id)
+
+        except Exception as e:
+            logger.error(f"Evaluation failed for {request.model_id}: {str(e)}")
+            # Keep request in active queue for retry
+
 def init_leaderboard(df):
     """Initialize the leaderboard with the given DataFrame."""
     if df is None or df.empty:
-        # Create an empty DataFrame with the required columns
         df = pd.DataFrame(columns=COLS)
-        print("Creating empty leaderboard - no evaluations completed yet")
+        logger.info("Creating empty leaderboard - no evaluations completed yet")
 
     # Create the leaderboard
     return gr.Dataframe(
@@ -183,8 +260,27 @@ with demo:
 
             submit_button = gr.Button("Submit for Security Evaluation")
             submission_result = gr.Markdown()
+            def handle_submission(model, base_model, revision, precision, weight_type, model_type):
+                """Handle new model submission."""
+                try:
+                    logger.info(f"New submission received for {model}")
+
+                    # Add to queue
+                    request_id = queue_manager.add_request(
+                        model_id=model,
+                        revision=revision if revision else "main"
+                    )
+
+                    # Process queue
+                    process_evaluation_queue()
+
+                    return gr.Markdown("Submission successful! Your model has been added to the evaluation queue.")
+                except Exception as e:
+                    logger.error(f"Submission failed: {str(e)}")
+                    return gr.Markdown(f"Error: {str(e)}")
+
             submit_button.click(
-                add_new_eval,
+                handle_submission,
                 [
                     model_name_textbox,
                     base_model_name_textbox,
@@ -206,7 +302,11 @@ with demo:
                 show_copy_button=True,
             )
 
+# Setup schedulers
scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.add_job(process_evaluation_queue, "interval", seconds=300)  # Process queue every 5 minutes
 scheduler.start()
+
+logger.info("Application startup complete")
 demo.queue(default_concurrency_limit=40).launch()
src/core/evaluation.py ADDED
@@ -0,0 +1,105 @@
+"""Core evaluation logic separated from web interface."""
+import logging
+from typing import Dict, Any, Optional
+from dataclasses import dataclass
+from datetime import datetime
+import json
+import os
+
+from src.leaderboard.security_eval import (
+    check_safetensors,
+    evaluate_secure_coding,
+    load_model_and_tokenizer,
+)
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('evaluation.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+@dataclass
+class EvaluationRequest:
+    """Data class for evaluation requests."""
+    model_id: str
+    revision: str
+    precision: str
+    weight_type: str
+    submitted_time: datetime
+    base_model: Optional[str] = None
+
+class EvaluationManager:
+    """Manages the evaluation pipeline and state."""
+
+    def __init__(self, results_dir: str, backup_dir: str):
+        self.results_dir = results_dir
+        self.backup_dir = backup_dir
+        self._ensure_directories()
+        self.logger = logging.getLogger(__name__)
+
+    def _ensure_directories(self) -> None:
+        """Ensure required directories exist."""
+        os.makedirs(self.results_dir, exist_ok=True)
+        os.makedirs(self.backup_dir, exist_ok=True)
+
+    def backup_results(self, eval_id: str) -> None:
+        """Create backup of evaluation results."""
+        try:
+            result_path = os.path.join(self.results_dir, f"{eval_id}.json")
+            backup_path = os.path.join(self.backup_dir, f"{eval_id}_{datetime.now().isoformat()}.json")
+
+            if os.path.exists(result_path):
+                with open(result_path, 'r') as src, open(backup_path, 'w') as dst:
+                    json.dump(json.load(src), dst, indent=2)
+                self.logger.info(f"Created backup at {backup_path}")
+        except Exception as e:
+            self.logger.error(f"Backup failed for {eval_id}: {str(e)}")
+
+    def run_evaluation(self, request: EvaluationRequest) -> Dict[str, Any]:
+        """Run evaluation pipeline for a model."""
+        try:
+            self.logger.info(f"Starting evaluation for {request.model_id}")
+
+            # Run security checks
+            safetensors_compliant = check_safetensors(request.model_id, request.revision)
+            self.logger.info(f"Safetensors check: {safetensors_compliant}")
+
+            # Load model for evaluation
+            model, tokenizer = load_model_and_tokenizer(request.model_id, request.revision)
+            self.logger.info("Model loaded successfully")
+
+            # Run security evaluation
+            security_score = evaluate_secure_coding(request.model_id, request.revision)
+            self.logger.info(f"Security evaluation complete. Score: {security_score}")
+
+            # Compile results
+            results = {
+                "model_id": request.model_id,
+                "revision": request.revision,
+                "evaluation_time": datetime.now().isoformat(),
+                "safetensors_compliant": safetensors_compliant,
+                "security_score": security_score,
+                "precision": request.precision,
+                "weight_type": request.weight_type,
+            }
+
+            # Save and backup results
+            eval_id = f"{request.model_id.replace('/', '_')}_{request.revision}"
+            result_path = os.path.join(self.results_dir, f"{eval_id}.json")
+
+            with open(result_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            self.backup_results(eval_id)
+            self.logger.info(f"Evaluation complete for {request.model_id}")
+
+            return results
+
+        except Exception as e:
+            self.logger.error(f"Evaluation failed for {request.model_id}: {str(e)}")
+            raise
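
A minimal usage sketch of the EvaluationManager API introduced above, for orientation only (the directory names and the "org/model" id are illustrative, and run_evaluation still depends on src.leaderboard.security_eval being importable):

from datetime import datetime
from src.core.evaluation import EvaluationManager, EvaluationRequest

# Illustrative paths; app.py wires these to EVAL_RESULTS_PATH and CACHE_PATH.
manager = EvaluationManager(results_dir="eval-results", backup_dir="eval-backups")

request = EvaluationRequest(
    model_id="org/model",        # hypothetical model id
    revision="main",
    precision="float16",
    weight_type="Safetensors",
    submitted_time=datetime.now(),
)

# Runs the safetensors check and the secure-coding evaluation, writes
# eval-results/org_model_main.json, then creates a timestamped backup.
results = manager.run_evaluation(request)
print(results["security_score"], results["safetensors_compliant"])
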
src/core/queue_manager.py ADDED
@@ -0,0 +1,124 @@
+"""Queue management system for model evaluations."""
+import logging
+from typing import List, Optional
+from dataclasses import dataclass
+from datetime import datetime
+import json
+import os
+from queue import PriorityQueue
+import threading
+from threading import Lock
+
+logger = logging.getLogger(__name__)
+
+@dataclass(order=True)
+class QueueItem:
+    """Priority queue item for evaluations."""
+    priority: int
+    timestamp: datetime
+    request_id: str
+    model_id: str
+    revision: str
+
+class QueueManager:
+    """Manages evaluation request queue with persistence."""
+
+    def __init__(self, queue_dir: str):
+        self.queue_dir = queue_dir
+        self.queue = PriorityQueue()
+        self.active_evaluations: List[str] = []
+        self.lock = Lock()
+        self._load_persisted_queue()
+
+    def _load_persisted_queue(self) -> None:
+        """Load persisted queue items from disk."""
+        try:
+            queue_file = os.path.join(self.queue_dir, "queue_state.json")
+            if os.path.exists(queue_file):
+                with open(queue_file, 'r') as f:
+                    items = json.load(f)
+                    for item in items:
+                        self.queue.put(QueueItem(
+                            priority=item['priority'],
+                            timestamp=datetime.fromisoformat(item['timestamp']),
+                            request_id=item['request_id'],
+                            model_id=item['model_id'],
+                            revision=item['revision']
+                        ))
+                logger.info(f"Loaded {self.queue.qsize()} items from persisted queue")
+        except Exception as e:
+            logger.error(f"Failed to load persisted queue: {str(e)}")
+
+    def _persist_queue(self) -> None:
+        """Persist current queue state to disk."""
+        try:
+            # Create a list of all queue items
+            items = []
+            temp_queue = PriorityQueue()
+
+            while not self.queue.empty():
+                item = self.queue.get()
+                items.append({
+                    'priority': item.priority,
+                    'timestamp': item.timestamp.isoformat(),
+                    'request_id': item.request_id,
+                    'model_id': item.model_id,
+                    'revision': item.revision
+                })
+                temp_queue.put(item)
+
+            # Restore queue
+            self.queue = temp_queue
+
+            # Save to disk
+            os.makedirs(self.queue_dir, exist_ok=True)
+            queue_file = os.path.join(self.queue_dir, "queue_state.json")
+            with open(queue_file, 'w') as f:
+                json.dump(items, f, indent=2)
+
+            logger.info(f"Persisted {len(items)} items to queue state")
+        except Exception as e:
+            logger.error(f"Failed to persist queue: {str(e)}")
+
+    def add_request(self, model_id: str, revision: str, priority: int = 1) -> str:
+        """Add new evaluation request to queue."""
+        with self.lock:
+            request_id = f"{model_id.replace('/', '_')}_{revision}_{datetime.now().isoformat()}"
+            item = QueueItem(
+                priority=priority,
+                timestamp=datetime.now(),
+                request_id=request_id,
+                model_id=model_id,
+                revision=revision
+            )
+            self.queue.put(item)
+            self._persist_queue()
+            logger.info(f"Added request {request_id} to queue")
+            return request_id
+
+    def get_next_request(self) -> Optional[QueueItem]:
+        """Get next request from queue."""
+        with self.lock:
+            if not self.queue.empty():
+                item = self.queue.get()
+                self.active_evaluations.append(item.request_id)
+                self._persist_queue()
+                logger.info(f"Retrieved request {item.request_id} from queue")
+                return item
+            return None
+
+    def mark_complete(self, request_id: str) -> None:
+        """Mark evaluation request as complete."""
+        with self.lock:
+            if request_id in self.active_evaluations:
+                self.active_evaluations.remove(request_id)
+                logger.info(f"Marked request {request_id} as complete")
+
+    def get_queue_status(self) -> dict:
+        """Get current queue status."""
+        with self.lock:
+            return {
+                'queued': self.queue.qsize(),
+                'active': len(self.active_evaluations),
+                'active_evaluations': self.active_evaluations
+            }
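
A short sketch of the QueueManager lifecycle that app.py relies on (the queue directory is illustrative; state is persisted to queue_state.json inside it, and a lower priority value is served first):

from src.core.queue_manager import QueueManager

queue = QueueManager(queue_dir="eval-queue")  # illustrative directory

# Enqueue a request; the default priority is 1.
request_id = queue.add_request("org/model", "main")

# Dequeuing moves the item into active_evaluations until it is marked complete.
item = queue.get_next_request()
if item is not None:
    # ... run the evaluation for item.model_id at item.revision ...
    queue.mark_complete(item.request_id)

print(queue.get_queue_status())  # {'queued': 0, 'active': 0, 'active_evaluations': []}
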
src/logging_config.py ADDED
@@ -0,0 +1,71 @@
+"""Logging configuration for the application."""
+import logging
+import logging.handlers
+import os
+from datetime import datetime
+
+def setup_logging(
+    log_dir: str = "logs",
+    log_level: int = logging.INFO
+) -> None:
+    """
+    Configure application-wide logging.
+
+    Args:
+        log_dir: Directory to store log files
+        log_level: Logging level to use
+    """
+    # Create logs directory
+    os.makedirs(log_dir, exist_ok=True)
+
+    # Create formatters
+    file_formatter = logging.Formatter(
+        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    console_formatter = logging.Formatter(
+        '%(levelname)s: %(message)s'
+    )
+
+    # Setup file handler with rotation
+    log_file = os.path.join(
+        log_dir,
+        f"leaderboard_{datetime.now().strftime('%Y%m%d')}.log"
+    )
+    file_handler = logging.handlers.RotatingFileHandler(
+        log_file,
+        maxBytes=10485760,  # 10MB
+        backupCount=5
+    )
+    file_handler.setFormatter(file_formatter)
+
+    # Setup console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(console_formatter)
+
+    # Setup root logger
+    root_logger = logging.getLogger()
+    root_logger.setLevel(log_level)
+    root_logger.addHandler(file_handler)
+    root_logger.addHandler(console_handler)
+
+    # Create separate loggers for different components
+    loggers = {
+        'evaluation': logging.getLogger('evaluation'),
+        'queue': logging.getLogger('queue'),
+        'web': logging.getLogger('web'),
+        'security': logging.getLogger('security')
+    }
+
+    # Configure component loggers
+    for name, logger in loggers.items():
+        logger.setLevel(log_level)
+
+        # Create component-specific file handler
+        component_log = os.path.join(log_dir, f"{name}.log")
+        handler = logging.handlers.RotatingFileHandler(
+            component_log,
+            maxBytes=5242880,  # 5MB
+            backupCount=3
+        )
+        handler.setFormatter(file_formatter)
+        logger.addHandler(handler)
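
For reference, how the logging setup is consumed elsewhere in this commit (app.py calls it once at startup; each component logger also writes to its own rotating file such as logs/web.log):

import logging
from src.logging_config import setup_logging

setup_logging(log_dir="logs", log_level=logging.INFO)

# 'evaluation', 'queue', 'web' and 'security' are the component loggers configured above.
logger = logging.getLogger("web")
logger.info("Application startup complete")
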
tests/test_evaluation.py ADDED
@@ -0,0 +1,103 @@
+"""Tests for core evaluation functionality."""
+import pytest
+import os
+import json
+from datetime import datetime
+from src.core.evaluation import EvaluationManager, EvaluationRequest
+
+@pytest.fixture
+def evaluation_manager(tmp_path):
+    """Create evaluation manager with temporary directories."""
+    results_dir = tmp_path / "results"
+    backup_dir = tmp_path / "backups"
+    return EvaluationManager(str(results_dir), str(backup_dir))
+
+def test_evaluation_manager_init(evaluation_manager):
+    """Test evaluation manager initialization."""
+    assert os.path.exists(evaluation_manager.results_dir)
+    assert os.path.exists(evaluation_manager.backup_dir)
+
+def test_backup_results(evaluation_manager):
+    """Test backup creation."""
+    # Create test results
+    eval_id = "test_model_main"
+    result_path = os.path.join(evaluation_manager.results_dir, f"{eval_id}.json")
+    test_results = {"test": "data"}
+
+    os.makedirs(os.path.dirname(result_path), exist_ok=True)
+    with open(result_path, 'w') as f:
+        json.dump(test_results, f)
+
+    # Create backup
+    evaluation_manager.backup_results(eval_id)
+
+    # Check backup exists
+    backup_files = os.listdir(evaluation_manager.backup_dir)
+    assert len(backup_files) == 1
+    assert backup_files[0].startswith(eval_id)
+
+def test_run_evaluation(evaluation_manager):
+    """Test full evaluation run."""
+    request = EvaluationRequest(
+        model_id="hf-internal-testing/tiny-random-gpt2",
+        revision="main",
+        precision="float16",
+        weight_type="Safetensors",
+        submitted_time=datetime.now()
+    )
+
+    results = evaluation_manager.run_evaluation(request)
+
+    assert results["model_id"] == request.model_id
+    assert results["revision"] == request.revision
+    assert "security_score" in results
+    assert "safetensors_compliant" in results
+
+def test_evaluation_error_handling(evaluation_manager):
+    """Test error handling during evaluation."""
+    request = EvaluationRequest(
+        model_id="invalid/model",
+        revision="main",
+        precision="float16",
+        weight_type="Safetensors",
+        submitted_time=datetime.now()
+    )
+
+    with pytest.raises(Exception):
+        evaluation_manager.run_evaluation(request)
+
+def test_concurrent_evaluations(evaluation_manager, tmp_path):
+    """Test handling of concurrent evaluations."""
+    import threading
+    import time
+
+    def run_eval(model_id):
+        request = EvaluationRequest(
+            model_id=model_id,
+            revision="main",
+            precision="float16",
+            weight_type="Safetensors",
+            submitted_time=datetime.now()
+        )
+        try:
+            evaluation_manager.run_evaluation(request)
+        except Exception:
+            pass
+
+    # Start multiple evaluation threads
+    threads = []
+    for i in range(3):
+        thread = threading.Thread(
+            target=run_eval,
+            args=(f"model_{i}",)
+        )
+        threads.append(thread)
+        thread.start()
+
+    # Wait for all threads to complete
+    for thread in threads:
+        thread.join()
+
+    # Check results directory integrity
+    assert os.path.exists(evaluation_manager.results_dir)
+    assert os.path.exists(evaluation_manager.backup_dir)
tests/test_queue.py ADDED
@@ -0,0 +1,128 @@
+"""Tests for queue management system."""
+import pytest
+import os
+import json
+from datetime import datetime
+from src.core.queue_manager import QueueManager, QueueItem
+
+@pytest.fixture
+def queue_manager(tmp_path):
+    """Create queue manager with temporary directory."""
+    queue_dir = tmp_path / "queue"
+    return QueueManager(str(queue_dir))
+
+def test_queue_manager_init(queue_manager):
+    """Test queue manager initialization."""
+    assert os.path.exists(queue_manager.queue_dir)
+    assert queue_manager.queue.empty()
+    assert len(queue_manager.active_evaluations) == 0
+
+def test_add_request(queue_manager):
+    """Test adding requests to queue."""
+    request_id = queue_manager.add_request("org/model", "main")
+
+    assert not queue_manager.queue.empty()
+    assert os.path.exists(os.path.join(queue_manager.queue_dir, "queue_state.json"))
+
+    # Verify persisted state
+    with open(os.path.join(queue_manager.queue_dir, "queue_state.json")) as f:
+        state = json.load(f)
+        assert len(state) == 1
+        assert state[0]["model_id"] == "org/model"
+
+def test_get_next_request(queue_manager):
+    """Test retrieving requests from queue."""
+    added_id = queue_manager.add_request("org/model", "main")
+    item = queue_manager.get_next_request()
+
+    assert item is not None
+    assert item.model_id == "org/model"
+    assert item.revision == "main"
+    assert item.request_id in queue_manager.active_evaluations
+
+def test_mark_complete(queue_manager):
+    """Test marking requests as complete."""
+    added_id = queue_manager.add_request("org/model", "main")
+    item = queue_manager.get_next_request()
+    queue_manager.mark_complete(item.request_id)
+
+    assert item.request_id not in queue_manager.active_evaluations
+
+def test_queue_status(queue_manager):
+    """Test queue status reporting."""
+    queue_manager.add_request("org/model1", "main")
+    queue_manager.add_request("org/model2", "main")
+    item = queue_manager.get_next_request()
+
+    status = queue_manager.get_queue_status()
+    assert status["queued"] == 1
+    assert status["active"] == 1
+    assert item.request_id in status["active_evaluations"]
+
+def test_priority_ordering(queue_manager):
+    """Test priority-based queue ordering."""
+    # Add requests with different priorities
+    queue_manager.add_request("org/model1", "main", priority=2)
+    queue_manager.add_request("org/model2", "main", priority=1)  # Higher priority
+    queue_manager.add_request("org/model3", "main", priority=3)
+
+    # First request should be model2 (priority 1)
+    item = queue_manager.get_next_request()
+    assert item.model_id == "org/model2"
+
+    # Second should be model1 (priority 2)
+    item = queue_manager.get_next_request()
+    assert item.model_id == "org/model1"
+
+    # Third should be model3 (priority 3)
+    item = queue_manager.get_next_request()
+    assert item.model_id == "org/model3"
+
+def test_queue_persistence(tmp_path):
+    """Test queue state persistence across instances."""
+    queue_dir = str(tmp_path / "queue")
+
+    # Create first instance and add requests
+    manager1 = QueueManager(queue_dir)
+    manager1.add_request("org/model1", "main")
+    manager1.add_request("org/model2", "main")
+
+    # Create second instance and verify state loaded
+    manager2 = QueueManager(queue_dir)
+    assert manager2.queue.qsize() == 2
+
+    # Verify requests can be retrieved in correct order
+    item1 = manager2.get_next_request()
+    assert item1.model_id == "org/model1"
+
+    item2 = manager2.get_next_request()
+    assert item2.model_id == "org/model2"
+
+def test_concurrent_access(queue_manager):
+    """Test concurrent queue access."""
+    import threading
+    import time
+
+    def add_and_get():
+        # Add a request
+        queue_manager.add_request("org/model", "main")
+        time.sleep(0.1)  # Simulate some work
+        # Try to get a request
+        item = queue_manager.get_next_request()
+        if item:
+            queue_manager.mark_complete(item.request_id)
+
+    # Create multiple threads
+    threads = []
+    for _ in range(5):
+        thread = threading.Thread(target=add_and_get)
+        threads.append(thread)
+        thread.start()
+
+    # Wait for all threads to complete
+    for thread in threads:
+        thread.join()
+
+    # Verify queue state is consistent
+    status = queue_manager.get_queue_status()
+    assert len(status["active_evaluations"]) == 0  # All should be marked complete