File size: 31,145 Bytes
df2b222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
"""

Warm Sandbox Pool for Modal - Async Queue-Based Implementation

This module provides a pre-warmed pool of Modal sandboxes to reduce cold-start latency.

"""
import asyncio
import time
from typing import Optional, Dict, Any
from contextlib import asynccontextmanager
from dataclasses import dataclass
from enum import Enum

import modal

from mcp_hub.logging_config import logger
from mcp_hub.exceptions import CodeExecutionError


class SandboxHealth(Enum):
    """Sandbox health status.

    Stored on ``PooledSandbox.health`` and consulted by
    ``WarmSandboxPool._should_recycle_sandbox``.
    """
    # Assigned by the periodic health check when the probe command succeeds.
    HEALTHY = "healthy"
    # Assigned by the periodic health check when the probe fails; a sandbox
    # in this state is refused when recycling is considered.
    UNHEALTHY = "unhealthy"
    # Default for a sandbox that has not been health-checked yet.
    UNKNOWN = "unknown"


@dataclass
class PooledSandbox:
    """Container for a pooled sandbox with metadata.

    Instances travel through ``WarmSandboxPool``'s queue; the pool reads and
    updates the bookkeeping fields to decide when a sandbox must be retired.
    """
    # The underlying Modal sandbox handle.
    sandbox: modal.Sandbox
    # ``time.time()`` timestamp taken when this wrapper was built; compared
    # against the pool's ``max_age_seconds``.
    created_at: float
    # ``time.time()`` timestamp of the most recent hand-out to a caller.
    last_used: float
    # Outcome of the latest periodic health check (UNKNOWN until one has run).
    health: SandboxHealth = SandboxHealth.UNKNOWN
    # How many times the sandbox has been handed out; compared against the
    # pool's ``max_uses_per_sandbox``.
    use_count: int = 0


class WarmSandboxPool:
    """Async queue-based warm sandbox pool with health checking."""
    
    def __init__(
        self,
        app: modal.App,
        image: modal.Image,
        pool_size: int = 3,
        max_age_seconds: int = 300,  # 5 minutes
        max_uses_per_sandbox: int = 10,
        health_check_interval: int = 60,  # 1 minute
    ):
        """Record the pool configuration; nothing is created until ``start()``.

        Args:
            app: Modal app passed to ``modal.Sandbox.create``.
            image: Modal image passed to ``modal.Sandbox.create``.
            pool_size: Capacity of the queue of warm sandboxes.
            max_age_seconds: Age after which a sandbox is no longer recycled.
            max_uses_per_sandbox: Hand-out count after which a sandbox is no
                longer recycled.
            health_check_interval: Seconds between background health checks.
        """
        # Configuration, stored verbatim.
        self.app = app
        self.image = image
        self.pool_size = pool_size
        self.max_age_seconds = max_age_seconds
        self.max_uses_per_sandbox = max_uses_per_sandbox
        self.health_check_interval = health_check_interval

        # Lifecycle flag toggled by start()/stop().
        self._running = False

        # Warm sandboxes that are ready to be handed out.
        self._sandbox_queue: asyncio.Queue[PooledSandbox] = asyncio.Queue(maxsize=pool_size)

        # Handles of the background tasks; populated by start().
        self._warmup_task: Optional[asyncio.Task] = None
        self._health_check_task: Optional[asyncio.Task] = None
        self._cleanup_task: Optional[asyncio.Task] = None

        # Counters exposed through get_stats(), all starting at zero.
        self._stats = dict.fromkeys(
            ("created", "reused", "recycled", "health_checks", "failures"), 0
        )

        # Failure tracking that drives the emergency pool reset.
        self._consecutive_failures = 0
        self._pool_reset_threshold = 5  # Reset pool after 5 consecutive failures
        self._last_successful_creation = time.time()
        
    async def start(self):
        """Launch the warm-up, health-check and cleanup background tasks.

        Calling this on an already running pool does nothing. After the tasks
        are scheduled the coroutine pauses for one second so the warm-up task
        can begin; it does not wait for the pool to fill.
        """
        if not self._running:
            self._running = True
            logger.info(f"Starting warm sandbox pool with {self.pool_size} sandboxes")

            spawn = asyncio.create_task
            self._warmup_task = spawn(self._warmup_pool())
            self._health_check_task = spawn(self._health_check_loop())
            self._cleanup_task = spawn(self._cleanup_loop())

            # Brief head start for the warm-up task.
            await asyncio.sleep(1)
        
    async def stop(self):
        """Shut the pool down: cancel background tasks, then drain the queue.

        Calling this on a pool that is not running does nothing. Every
        sandbox still sitting in the queue is terminated.
        """
        if not self._running:
            return

        self._running = False
        logger.info("Stopping warm sandbox pool")

        # Cancel each unfinished background task and wait for it to unwind.
        background = (self._warmup_task, self._health_check_task, self._cleanup_task)
        for job in background:
            if job is None or job.done():
                continue
            job.cancel()
            try:
                await job
            except asyncio.CancelledError:
                pass

        # Terminate whatever is still waiting in the queue.
        while not self._sandbox_queue.empty():
            try:
                leftover = self._sandbox_queue.get_nowait()
            except asyncio.QueueEmpty:
                break
            await self._terminate_sandbox(leftover.sandbox)
                
    @asynccontextmanager
    async def get_sandbox(self, timeout: float = 5.0):
        """Lend a live sandbox to the caller for the span of an ``async with``.

        A warm sandbox is taken from the queue when one arrives within
        ``timeout`` seconds and answers a liveness ping; if the wait times out
        a new sandbox is created on demand. Up to three attempts are made.

        On exit, a pooled sandbox that is still within its age/use limits and
        passes a liveness ping plus a trial ``exec`` goes back to the queue.
        Every other sandbox, including all on-demand ones, is terminated.

        Args:
            timeout: Seconds to wait for a pooled sandbox on each attempt.

        Yields:
            The ``modal.Sandbox`` to run code in.

        Raises:
            CodeExecutionError: If no sandbox could be obtained, or if the
                caller's ``async with`` body raised. The original exception
                is chained as ``__cause__``.
        """
        pooled_sb = None
        created_new = False
        # Flips to True right before the caller's body gets control, so the
        # handler below can tell acquisition failures from caller failures.
        handed_out = False
        try:
            # Check if we need to reset the pool due to consecutive failures
            if self._consecutive_failures >= self._pool_reset_threshold:
                logger.warning(f"Pool has {self._consecutive_failures} consecutive failures, attempting reset")
                await self._emergency_pool_reset()

            # Try to get a warm sandbox from the pool, retry if not alive
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    pooled_sb = await asyncio.wait_for(self._sandbox_queue.get(), timeout=timeout)
                    alive = await self._is_sandbox_alive(pooled_sb.sandbox)
                    if not alive:
                        logger.info(f"Got dead sandbox from pool on attempt {attempt + 1}, terminating and trying next.")
                        await self._terminate_sandbox(pooled_sb.sandbox)
                        pooled_sb = None
                        continue  # Try again

                    # Sandbox is alive, use it
                    pooled_sb.last_used = time.time()
                    pooled_sb.use_count += 1
                    self._stats["reused"] += 1
                    self._consecutive_failures = 0  # Reset failure counter on success
                    break

                except asyncio.TimeoutError:
                    # Pool empty or taking too long, create a new one
                    logger.info(f"Pool timeout on attempt {attempt + 1}, creating new sandbox")
                    try:
                        sandbox = await self._create_sandbox()
                        pooled_sb = PooledSandbox(
                            sandbox=sandbox,
                            created_at=time.time(),
                            last_used=time.time(),
                            use_count=1
                        )
                        created_new = True
                        self._stats["created"] += 1
                        self._consecutive_failures = 0  # Reset failure counter on success
                        self._last_successful_creation = time.time()
                        break
                    except Exception as create_error:
                        logger.error(f"Failed to create sandbox on attempt {attempt + 1}: {create_error}")
                        self._consecutive_failures += 1
                        if attempt == max_retries - 1:  # Last attempt
                            raise CodeExecutionError(
                                f"Failed to create sandbox after {max_retries} attempts: {create_error}"
                            ) from create_error
                        await asyncio.sleep(2 ** attempt)  # Exponential backoff
            else:
                # Every attempt pulled a dead sandbox out of the queue.
                self._consecutive_failures += 1
                raise CodeExecutionError("Could not obtain a live sandbox from the pool after all retry attempts.")

            logger.info(f"Yielding sandbox of type from sandbox_pool: {type(pooled_sb.sandbox)}")    
            handed_out = True
            yield pooled_sb.sandbox

        except Exception as e:
            if handed_out:
                # The caller's body raised. That says nothing about the pool's
                # ability to supply sandboxes, so the pool failure counters
                # (which drive the emergency reset) are left alone.
                logger.error(f"Error while sandbox was in use: {e}")
            else:
                logger.error(f"Error getting sandbox: {e}")
                self._stats["failures"] += 1
                # A CodeExecutionError raised above was already added to the
                # consecutive-failure counter; do not count it twice.
                if not isinstance(e, CodeExecutionError):
                    self._consecutive_failures += 1
            # Type and message are kept for callers that catch
            # CodeExecutionError; the cause is chained for debugging.
            raise CodeExecutionError(f"Failed to get sandbox: {e}") from e
        finally:
            if pooled_sb:
                should_recycle = (
                    not created_new and
                    self._should_recycle_sandbox(pooled_sb) and
                    self._running
                )
                if not should_recycle:
                    # On-demand, worn-out, or pool stopped: terminate.
                    await self._terminate_sandbox(pooled_sb.sandbox)
                    if not created_new:
                        self._stats["recycled"] += 1
                        logger.debug("Terminated sandbox (exceeded recycle criteria)")
                elif not await self._is_sandbox_alive(pooled_sb.sandbox):
                    logger.debug("Sandbox is dead, terminating instead of recycling")
                    await self._terminate_sandbox(pooled_sb.sandbox)
                else:
                    # Trial exec to make sure the sandbox still accepts work
                    # before it is offered to the next caller.
                    try:
                        await asyncio.wait_for(
                            asyncio.get_event_loop().run_in_executor(
                                None,
                                lambda: pooled_sb.sandbox.exec("python", "-c", "import sys; print('ready')", timeout=2)
                            ),
                            timeout=3.0
                        )

                        try:
                            self._sandbox_queue.put_nowait(pooled_sb)
                            logger.debug("Returned healthy sandbox to pool")
                        except asyncio.QueueFull:
                            # Pool is full - terminate excess sandbox
                            await self._terminate_sandbox(pooled_sb.sandbox)
                            logger.debug("Pool full, terminated excess sandbox")
                    except Exception as e:
                        logger.debug(f"Sandbox failed functional test, terminating: {e}")
                        await self._terminate_sandbox(pooled_sb.sandbox)
    
    async def _create_sandbox(self) -> modal.Sandbox:
        """Create one Modal sandbox in a worker thread, bounded by a timeout.

        Returns:
            The sandbox returned by ``modal.Sandbox.create``.

        Raises:
            Exception: If creation does not finish within 120 seconds, or
                whatever ``modal.Sandbox.create`` itself raised.
        """
        def _spawn():
            # Blocking Modal call; runs in the default executor.
            return modal.Sandbox.create(
                app=self.app,
                image=self.image,
                cpu=2.0,
                memory=1024,
                timeout=35,
            )

        try:
            loop = asyncio.get_event_loop()
            pending = loop.run_in_executor(None, _spawn)
            # 2 minute ceiling on the wait for the worker thread.
            new_sandbox = await asyncio.wait_for(pending, timeout=120)
            logger.debug(f"Created new sandbox of type: {type(new_sandbox)}")
            return new_sandbox
        except asyncio.TimeoutError:
            logger.error("Sandbox creation timed out after 2 minutes")
            raise Exception("Sandbox creation timed out - Modal may be experiencing issues")
        except Exception as err:
            logger.error(f"Failed to create sandbox: {err}")
            raise
    
    async def _terminate_sandbox(self, sandbox: modal.Sandbox):
        """Terminate ``sandbox`` on a best-effort basis; never raises ``Exception``.

        A sandbox whose ``_terminated`` attribute is truthy is skipped. The
        blocking ``terminate`` call runs in a worker thread with a 10 second
        ceiling. Failures are logged and, when the attribute exists, the
        sandbox is flagged as terminated.
        """
        try:
            if getattr(sandbox, '_terminated', False):
                logger.debug("Sandbox already terminated")
                return

            loop = asyncio.get_event_loop()
            shutdown = loop.run_in_executor(None, sandbox.terminate)
            await asyncio.wait_for(shutdown, timeout=10.0)
            logger.debug("Terminated sandbox successfully")
        except asyncio.TimeoutError:
            logger.warning("Sandbox termination timed out - may be unresponsive")
        except Exception as err:
            # Deliberately swallowed: the sandbox may simply be gone already.
            logger.warning(f"Failed to terminate sandbox (may already be dead): {err}")
            # Flag it so later calls skip the termination attempt.
            if hasattr(sandbox, '_terminated'):
                sandbox._terminated = True
    
    def _should_recycle_sandbox(self, pooled_sb: PooledSandbox) -> bool:
        """Tell whether ``pooled_sb`` may go back into the pool.

        Returns:
            ``False`` when the sandbox is older than ``max_age_seconds``, has
            reached ``max_uses_per_sandbox`` hand-outs, or is marked
            UNHEALTHY; ``True`` otherwise. The reason for a refusal is logged
            at debug level.
        """
        age = time.time() - pooled_sb.created_at

        if age > self.max_age_seconds:
            refusal = "Sandbox too old, not recycling"
        elif pooled_sb.use_count >= self.max_uses_per_sandbox:
            refusal = "Sandbox used too many times, not recycling"
        elif pooled_sb.health == SandboxHealth.UNHEALTHY:
            refusal = "Sandbox unhealthy, not recycling"
        else:
            return True

        logger.debug(refusal)
        return False
    async def _warmup_pool(self):
        """Background loop that tops the queue up with warm sandboxes.

        While the pool is running, each pass compares the queue size with a
        threshold of ``max(1, int(pool_size * 0.9))`` and, when below it,
        creates at most two sandboxes concurrently. The pause before the next
        pass depends on the size seen at the start of the pass: 1s when
        empty, 2s when below the threshold, 5s otherwise (10s after an error).
        """
        while self._running:
            try:
                queued = self._sandbox_queue.qsize()
                threshold = max(1, int(self.pool_size * 0.9))
                running_low = queued < threshold

                if running_low:
                    missing = self.pool_size - queued
                    logger.info(f"Pool size ({queued}) below threshold ({threshold}). Warming {missing} sandboxes...")

                    # Cap concurrent creations so Modal is not overwhelmed.
                    batch = min(missing, 2)
                    jobs = [
                        asyncio.create_task(self._create_and_queue_sandbox())
                        for _ in range(batch)
                    ]

                    if jobs:
                        outcomes = await asyncio.gather(*jobs, return_exceptions=True)
                        warmed = 0
                        for position, outcome in enumerate(outcomes, start=1):
                            if isinstance(outcome, Exception):
                                logger.warning(f"Failed to create sandbox {position}/{batch}: {outcome}")
                                continue
                            warmed += 1

                        if warmed > 0:
                            logger.info(f"Successfully warmed {warmed}/{batch} sandboxes")

                # Poll faster the emptier the pool was at the start of the pass.
                if queued == 0:
                    pause = 1
                elif running_low:
                    pause = 2
                else:
                    pause = 5

                await asyncio.sleep(pause)

            except Exception as err:
                logger.error(f"Error in warmup loop: {err}")
                await asyncio.sleep(10)  # Wait longer on error
    
    async def _create_and_queue_sandbox(self):
        """Create a sandbox, pre-import core packages, and add it to the queue.

        If the queue is already full once the sandbox is ready, the new
        sandbox is terminated instead of being queued.

        Raises:
            Exception: Whatever the creation steps raised. The error is
                logged and then re-raised so that the warm-up loop's
                ``gather(..., return_exceptions=True)`` sees the failure
                instead of counting the attempt as a success.
        """
        start_time = time.time()
        try:
            # Create the sandbox
            sandbox = await self._create_sandbox()
            creation_time = time.time() - start_time
            logger.info(f"Sandbox creation took {creation_time:.2f}s")

            # Proactively warm up the sandbox with core imports
            warmup_start = time.time()
            await self._warmup_sandbox_imports(sandbox)
            warmup_time = time.time() - warmup_start
            logger.info(f"Sandbox warmup with imports took {warmup_time:.2f}s")

            pooled_sb = PooledSandbox(
                sandbox=sandbox,
                created_at=time.time(),
                last_used=time.time()
            )

            try:
                self._sandbox_queue.put_nowait(pooled_sb)
                total_time = time.time() - start_time
                logger.info(f"Added warm sandbox to pool (total time: {total_time:.2f}s)")
            except asyncio.QueueFull:
                # Pool is full, terminate this sandbox
                await self._terminate_sandbox(sandbox)

        except Exception as e:
            total_time = time.time() - start_time
            logger.error(f"Failed to create and queue sandbox after {total_time:.2f}s: {e}")
            # Propagate: swallowing here made every warm-up attempt look
            # successful to the caller.
            raise

    async def _warmup_sandbox_imports(self, sandbox: modal.Sandbox):
        """Warm up ``sandbox`` by importing core packages inside it.

        The commands come from ``get_warmup_import_commands()`` and are
        joined into a single ``python -c`` script. Both launching the process
        and reading its output run in a worker thread, so the event loop is
        not blocked while the imports execute. Any failure is logged as a
        warning and swallowed; the sandbox is treated as still usable.
        """
        try:
            from mcp_hub.package_utils import get_warmup_import_commands

            # Get warmup commands
            import_commands = get_warmup_import_commands()
            warmup_script = "; ".join(import_commands)

            def _run_warmup():
                # Blocking Modal calls; returns (has_output, output).
                proc = sandbox.exec("python", "-c", warmup_script, timeout=30)
                if hasattr(proc, 'stdout') and hasattr(proc.stdout, 'read'):
                    return True, proc.stdout.read()
                return False, None

            # Execute the warmup script
            logger.debug("Running sandbox warmup imports...")
            has_output, output = await asyncio.get_event_loop().run_in_executor(
                None, _run_warmup
            )

            # Check if warmup was successful
            if not has_output:
                logger.debug("Sandbox warmup imports completed")
            elif "Core packages warmed up successfully" in output:
                logger.debug("Sandbox warmup imports completed successfully")
            else:
                logger.warning(f"Sandbox warmup completed but output unexpected: {output}")

        except Exception as e:
            logger.warning(f"Failed to warm up sandbox imports (sandbox still usable): {e}")
    async def _health_check_loop(self):
        """Background loop: periodic dead-sandbox sweep plus full health check.

        Each pass sleeps for ``health_check_interval`` seconds, runs
        ``_proactive_cleanup`` and then ``_perform_health_checks``. Errors
        are logged and followed by an extra 10 second pause.
        """
        while self._running:
            try:
                await asyncio.sleep(self.health_check_interval)

                # Cheap liveness sweep first, then the full check.
                removed = await self._proactive_cleanup()
                await self._perform_health_checks()

                if removed <= 0:
                    continue
                logger.info(f"Health check cleaned {removed} sandboxes, pool may need warming")

            except Exception as err:
                logger.error(f"Error in health check loop: {err}")
                await asyncio.sleep(10)  # Wait longer on error
    
    async def _perform_health_checks(self):
        """Health-check every queued sandbox, keeping only the healthy ones.

        The queue is drained one sandbox at a time. Each sandbox is probed
        and its ``health`` field updated. Unhealthy sandboxes are terminated
        and counted as recycled; healthy ones are put back afterwards (or
        terminated if the queue has filled up in the meantime).
        """
        queue = self._sandbox_queue
        survivors = []

        while not queue.empty():
            try:
                candidate = queue.get_nowait()
            except asyncio.QueueEmpty:
                break

            if await self._check_sandbox_health(candidate.sandbox):
                candidate.health = SandboxHealth.HEALTHY
                survivors.append(candidate)
            else:
                candidate.health = SandboxHealth.UNHEALTHY
                await self._terminate_sandbox(candidate.sandbox)
                self._stats["recycled"] += 1

        # Re-queue the sandboxes that passed.
        for candidate in survivors:
            try:
                queue.put_nowait(candidate)
            except asyncio.QueueFull:
                await self._terminate_sandbox(candidate.sandbox)

        self._stats["health_checks"] += 1
        logger.debug(f"Health check completed. Pool size: {queue.qsize()}")
    
    async def _check_sandbox_health(self, sandbox: modal.Sandbox) -> bool:
        """Check whether ``sandbox`` can still run a trivial Python command.

        Launching the probe and reading its output both happen in a worker
        thread; previously the blocking ``stdout.read()`` ran on the event
        loop thread and stalled every other coroutine while it waited.

        Returns:
            ``True`` if the probe printed ``health_check``; ``False`` if it
            did not or if anything raised.
        """
        def _probe():
            # Blocking Modal calls; runs in the default executor.
            proc = sandbox.exec("python", "-c", "print('health_check')", timeout=5)
            return proc.stdout.read()

        try:
            output = await asyncio.get_event_loop().run_in_executor(None, _probe)
            return "health_check" in output
        except Exception as e:
            logger.debug(f"Sandbox health check failed: {e}")
            return False
    
    async def _cleanup_loop(self):
        """Background loop that evicts expired sandboxes every 30 seconds.

        Errors raised by a sweep are logged and the loop carries on.
        """
        sweep_interval = 30  # seconds between sweeps
        while self._running:
            try:
                await asyncio.sleep(sweep_interval)
                await self._cleanup_old_sandboxes()
            except Exception as err:
                logger.error(f"Error in cleanup loop: {err}")
    
    async def _cleanup_old_sandboxes(self):
        """Terminate queued sandboxes that have reached ``max_age_seconds``.

        The queue is drained. Sandboxes younger than the limit (measured
        against a single timestamp taken at the start) are put back; the rest
        are terminated and counted as recycled.
        """
        cutoff_reference = time.time()
        queue = self._sandbox_queue
        still_fresh = []

        while not queue.empty():
            try:
                entry = queue.get_nowait()
            except asyncio.QueueEmpty:
                break

            age = cutoff_reference - entry.created_at
            if age < self.max_age_seconds:
                still_fresh.append(entry)
                continue

            # Expired: retire it.
            await self._terminate_sandbox(entry.sandbox)
            self._stats["recycled"] += 1
            logger.debug("Cleaned up old sandbox")

        # Put non-expired sandboxes back.
        for entry in still_fresh:
            try:
                queue.put_nowait(entry)
            except asyncio.QueueFull:
                await self._terminate_sandbox(entry.sandbox)

    async def _is_sandbox_alive(self, sandbox: modal.Sandbox) -> bool:
        """Check if a sandbox is alive by running a trivial command in it.

        The ping process is launched and its output read inside a worker
        thread, and the whole exchange is bounded by a 5 second timeout.
        Previously only the launch was bounded; the blocking
        ``stdout.read()`` ran on the event loop thread with no overall limit.

        Returns:
            ``True`` if the command's output contains ``ping``. ``False`` if
            the sandbox is already flagged as terminated, the check timed
            out, or anything raised (in which case the sandbox is flagged as
            terminated when it has that attribute).
        """
        def _ping():
            # Blocking Modal calls; runs in the default executor.
            proc = sandbox.exec("python", "-c", "print('ping')", timeout=3)
            if hasattr(proc, "stdout") and hasattr(proc.stdout, "read"):
                return proc.stdout.read()
            # For some Modal versions, output might be returned directly
            return str(proc)

        try:
            # Check if sandbox was already marked as terminated
            if hasattr(sandbox, '_terminated') and sandbox._terminated:
                return False

            out = await asyncio.wait_for(
                asyncio.get_event_loop().run_in_executor(None, _ping),
                timeout=5.0  # Overall timeout, now covering the read as well
            )
            return "ping" in out

        except asyncio.TimeoutError:
            logger.debug("Liveness check timed out - sandbox likely dead")
            return False
        except Exception as e:
            logger.debug(f"Liveness check failed: {e}")
            # Mark sandbox as dead to avoid repeated checks
            if hasattr(sandbox, '_terminated'):
                sandbox._terminated = True
            return False
    
    async def _emergency_pool_reset(self):
        """Emergency reset of the pool when too many consecutive failures occur.

        Every queued sandbox is terminated, the consecutive-failure counter
        is zeroed, and one fresh sandbox is created as a probe. A failed
        probe is logged but does not raise; the counter stays at zero so
        later calls can retry.
        """
        logger.warning("Performing emergency pool reset due to consecutive failures")

        # Drain and terminate all sandboxes in the pool
        terminated_count = 0
        while not self._sandbox_queue.empty():
            try:
                pooled_sb = self._sandbox_queue.get_nowait()
                await self._terminate_sandbox(pooled_sb.sandbox)
                terminated_count += 1
            except asyncio.QueueEmpty:
                break

        logger.info(f"Emergency reset: terminated {terminated_count} sandboxes")

        # Reset failure counter
        self._consecutive_failures = 0

        # Try to create one fresh sandbox to test if the underlying issue is resolved
        try:
            test_sandbox = await self._create_sandbox()
        except Exception as e:
            logger.error(f"Emergency reset failed to create test sandbox: {e}")
            return

        test_pooled = PooledSandbox(
            sandbox=test_sandbox,
            created_at=time.time(),
            last_used=time.time(),
            use_count=0
        )
        try:
            self._sandbox_queue.put_nowait(test_pooled)
        except asyncio.QueueFull:
            # Other tasks may have refilled the queue while creation was
            # awaited. Terminate the probe instead of leaking it.
            await self._terminate_sandbox(test_sandbox)
            logger.info("Emergency reset: pool already refilled, discarded test sandbox")
            return

        logger.info("Emergency reset successful: created test sandbox")

    def get_stats(self) -> Dict[str, Any]:
        """Return a snapshot of the pool counters plus derived health fields.

        ``health_status`` is ``"healthy"`` below three consecutive failures,
        ``"degraded"`` below the reset threshold, and ``"critical"`` from the
        threshold upwards.
        """
        failures = self._consecutive_failures
        if failures < 3:
            status = "healthy"
        elif failures < self._pool_reset_threshold:
            status = "degraded"
        else:
            status = "critical"

        last_success = self._last_successful_creation

        snapshot: Dict[str, Any] = dict(self._stats)
        snapshot["pool_size"] = self._sandbox_queue.qsize()
        snapshot["target_pool_size"] = self.pool_size
        snapshot["running"] = self._running
        snapshot["consecutive_failures"] = failures
        snapshot["last_successful_creation"] = last_success
        snapshot["time_since_last_success"] = time.time() - last_success
        snapshot["health_status"] = status
        return snapshot

    async def _proactive_cleanup(self):
        """Remove sandboxes that fail a liveness ping from the queue.

        The queue is drained. Live sandboxes are put back afterwards and
        dead ones are terminated.

        Returns:
            The number of sandboxes terminated, including any live sandbox
            that could not be re-queued because the queue was full.
        """
        queue = self._sandbox_queue
        live = []
        removed = 0

        while not queue.empty():
            try:
                entry = queue.get_nowait()
            except asyncio.QueueEmpty:
                break

            if await self._is_sandbox_alive(entry.sandbox):
                live.append(entry)
                continue

            # Dead: retire it.
            await self._terminate_sandbox(entry.sandbox)
            removed += 1
            logger.debug("Cleaned up dead sandbox during proactive cleanup")

        # Put healthy sandboxes back.
        for entry in live:
            try:
                queue.put_nowait(entry)
            except asyncio.QueueFull:
                # Shouldn't happen, but terminate if it does
                await self._terminate_sandbox(entry.sandbox)
                removed += 1

        if removed > 0:
            logger.info(f"Proactive cleanup removed {removed} dead sandboxes")

        return removed

# Helper function for testing and debugging the sandbox pool
async def test_sandbox_pool_health(pool: WarmSandboxPool) -> Dict[str, Any]:
    """Test sandbox pool health and return detailed diagnostics.

    Four checks are recorded under ``diagnostics["tests"]``: the pool stats
    snapshot, acquiring a sandbox, executing a trivial command in it, and
    whether the pool is at most one sandbox short of its target size.

    Args:
        pool: The pool to probe.

    Returns:
        A dict with ``timestamp``, ``pool_stats``, ``tests`` and
        ``overall_health`` (``"healthy"`` only when every test passed).
    """
    diagnostics: Dict[str, Any] = {
        "timestamp": time.time(),
        "pool_stats": pool.get_stats(),
        "tests": {}
    }

    logger.info("Starting sandbox pool health test...")

    # Test 1: Pool basic stats
    stats = pool.get_stats()
    diagnostics["tests"]["pool_stats"] = {
        "passed": True,
        "details": stats
    }

    # Test 2: Try to get a sandbox
    try:
        async with pool.get_sandbox(timeout=10.0) as sandbox:

            def _run_probe():
                # Blocking Modal calls, kept together so that reading the
                # output also happens off the event loop thread.
                proc = sandbox.exec("python", "-c", "print('health_test_ok')", timeout=5)
                return proc.stdout.read() if hasattr(proc.stdout, "read") else str(proc)

            # Test 3: Try to run a simple command
            try:
                output = await asyncio.get_event_loop().run_in_executor(None, _run_probe)

                diagnostics["tests"]["sandbox_execution"] = {
                    "passed": "health_test_ok" in output,
                    "output": output[:200],  # First 200 chars
                    "details": "Successfully executed test command"
                }
            except Exception as e:
                diagnostics["tests"]["sandbox_execution"] = {
                    "passed": False,
                    "error": str(e),
                    "details": "Failed to execute test command in sandbox"
                }

        diagnostics["tests"]["sandbox_acquisition"] = {
            "passed": True,
            "details": "Successfully acquired and released sandbox"
        }

    except Exception as e:
        diagnostics["tests"]["sandbox_acquisition"] = {
            "passed": False,
            "error": str(e),
            "details": "Failed to acquire sandbox from pool"
        }

        diagnostics["tests"]["sandbox_execution"] = {
            "passed": False,
            "error": "Could not test - no sandbox available",
            "details": "Skipped due to sandbox acquisition failure"
        }

    # Test 4: Check pool warmup status (uses the stats taken before Test 2)
    if pool._running:
        warmup_needed = pool.pool_size - stats["pool_size"]
        diagnostics["tests"]["pool_warmup"] = {
            "passed": warmup_needed <= 1,  # Allow 1 sandbox to be missing
            "details": f"Pool has {stats['pool_size']}/{pool.pool_size} sandboxes, {warmup_needed} needed"
        }
    else:
        diagnostics["tests"]["pool_warmup"] = {
            "passed": False,
            "details": "Pool is not running"
        }

    # Overall health assessment
    all_tests_passed = all(test.get("passed", False) for test in diagnostics["tests"].values())
    diagnostics["overall_health"] = "healthy" if all_tests_passed else "unhealthy"

    logger.info(f"Sandbox pool health test completed. Overall health: {diagnostics['overall_health']}")
    return diagnostics