Spaces:

mknolan
/

cursor_slides_internvl2

Paused

File size: 4,912 Bytes

02532a9

#!/usr/bin/env python3
"""
GPU Diagnostics Tool for Hugging Face Spaces
This script performs a comprehensive check of GPU availability and functionality.
"""

import os
import sys
import subprocess
import time
import json

# Banner: title framed by two full-width rules
print("=" * 80, "GPU DIAGNOSTICS TOOL", "=" * 80, sep="\n")

# Report which interpreter is running this script
print("Python version: " + sys.version)
print("-" * 80)

# Dump the environment variables that influence GPU visibility and caching;
# anything absent from the environment is reported as 'Not set'
print("ENVIRONMENT VARIABLES:")
gpu_related_vars = [
    "CUDA_VISIBLE_DEVICES",
    "NVIDIA_VISIBLE_DEVICES",
    "PYTORCH_CUDA_ALLOC_CONF",
    "HF_HOME",
]

for env_name in gpu_related_vars:
    env_value = os.environ.get(env_name, "Not set")
    print(f"{env_name}: {env_value}")
print("-" * 80)

# Check for nvidia-smi
# Runs the NVIDIA driver's CLI and prints its report (or its error output).
# Any failure to run it at all -- binary missing, timeout, permission error --
# is reported and the script carries on with the remaining checks.
print("CHECKING FOR NVIDIA-SMI:")
try:
    result = subprocess.run(
        ["nvidia-smi"],
        capture_output=True,
        text=True,
        # A wedged driver can make nvidia-smi block indefinitely; bound the
        # wait so the rest of the diagnostics still run. TimeoutExpired is
        # reported by the generic handler below.
        timeout=30,
    )
    if result.returncode == 0:
        print("nvidia-smi is available and working!")
        print(result.stdout)
    else:
        print("nvidia-smi error:")
        print(result.stderr)
except Exception as e:
    print(f"Error running nvidia-smi: {str(e)}")
print("-" * 80)

# Check PyTorch and CUDA
# Prints the PyTorch/CUDA versions, enumerates visible CUDA devices, runs a
# small matrix multiplication as a smoke test (on the GPU when CUDA is
# available, otherwise on the CPU), and finally prints extra details for
# device 0. Failures are reported on stdout rather than aborting the script.
print("CHECKING PYTORCH AND CUDA:")
try:
    import torch

    # Query availability once so every check below acts on the same answer.
    cuda_available = torch.cuda.is_available()

    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {cuda_available}")
    print(f"CUDA version: {torch.version.cuda if cuda_available else 'Not available'}")

    if cuda_available:
        print(f"CUDA device count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")
        print(f"Current CUDA device: {torch.cuda.current_device()}")

    # Try to create and operate on a tensor (CUDA if available, else CPU)
    print("\nTesting CUDA tensor creation:")
    try:
        start_time = time.perf_counter()
        x = torch.rand(1000, 1000, device="cuda" if cuda_available else "cpu")
        y = x @ x  # Matrix multiplication to test computation
        if cuda_available:
            # Wait for the GPU work to finish so the timing covers it.
            # Must be skipped without CUDA: torch.cuda.synchronize() raises
            # there, which previously turned a successful CPU run into a
            # reported error.
            torch.cuda.synchronize()
        end_time = time.perf_counter()

        if cuda_available:
            print(f"Successfully created and operated on a CUDA tensor in {end_time - start_time:.4f} seconds")
        else:
            print(f"Created and operated on a CPU tensor in {end_time - start_time:.4f} seconds (CUDA not available)")
    except Exception as e:
        print(f"Error in tensor creation/operation: {str(e)}")

    # Try to get more detailed CUDA info (first device only)
    if cuda_available:
        print("\nDetailed CUDA information:")
        print(f"CUDA capability: {torch.cuda.get_device_capability(0)}")
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        print(f"CUDA arch list: {torch.cuda.get_arch_list() if hasattr(torch.cuda, 'get_arch_list') else 'Not available'}")
except ImportError:
    print("PyTorch is not installed")
except Exception as e:
    # A broken install or driver problem can raise something other than
    # ImportError; report it and let the remaining diagnostics run.
    print(f"Error during PyTorch/CUDA checks: {e}")
print("-" * 80)

# Create a simple GPU test with a web interface
# If Gradio is importable, serve a one-button page that runs check_gpu();
# otherwise print a notice and finish.
print("CREATING SIMPLE GPU TEST WEB INTERFACE...")
try:
    import gradio as gr

    def check_gpu():
        """Run the GPU checks and return the findings as indented JSON text.

        The report always contains the Python version, the GPU-related
        environment variables, and whether PyTorch / CUDA are available.
        When CUDA is available it also records the CUDA version, device
        count, name of device 0, and the time taken by a 1000x1000 matrix
        multiplication on the GPU. Any exception raised along the way is
        stored under "error" and the GPU test is marked as failed.
        """
        report = dict(
            python_version=sys.version,
            environment_vars={
                name: os.environ.get(name, "Not set") for name in gpu_related_vars
            },
            torch_available=False,
            cuda_available=False,
        )

        try:
            import torch

            report["torch_available"] = True
            report["torch_version"] = torch.__version__
            has_cuda = torch.cuda.is_available()
            report["cuda_available"] = has_cuda

            if not has_cuda:
                report["gpu_test_passed"] = False
            else:
                report["cuda_version"] = torch.version.cuda
                report["cuda_device_count"] = torch.cuda.device_count()
                report["cuda_device_name"] = torch.cuda.get_device_name(0)

                # Smoke test: multiply a random matrix by itself on the GPU
                # and wait for the result before stopping the clock.
                began = time.time()
                matrix = torch.rand(1000, 1000, device="cuda")
                _ = matrix @ matrix
                torch.cuda.synchronize()
                elapsed = time.time() - began
                report["tensor_test_time"] = f"{elapsed:.4f} seconds"
                report["gpu_test_passed"] = True
        except Exception as exc:
            report["error"] = str(exc)
            report["gpu_test_passed"] = False

        return json.dumps(report, indent=2)

    demo = gr.Interface(
        title="GPU Diagnostics",
        description="Click the button to run GPU diagnostics",
        fn=check_gpu,
        inputs=[],
        outputs="text",
    )

    print("Starting Gradio web interface on port 7860...")
    # Bind to all interfaces so the page is reachable from outside the host
    demo.launch(server_name="0.0.0.0")
except ImportError:
    for notice in (
        "Gradio not installed, skipping web interface",
        "Raw GPU diagnostics complete.",
    ):
        print(notice)
print("-" * 80)