File size: 4,912 Bytes
02532a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
#!/usr/bin/env python3
"""
GPU Diagnostics Tool for Hugging Face Spaces
This script performs a comprehensive check of GPU availability and functionality.
"""
import os
import sys
import subprocess
import time
import json
print("=" * 80)
print("GPU DIAGNOSTICS TOOL")
print("=" * 80)
# Check Python version
print(f"Python version: {sys.version}")
print("-" * 80)
# Check environment variables
print("ENVIRONMENT VARIABLES:")
gpu_related_vars = [
"CUDA_VISIBLE_DEVICES",
"NVIDIA_VISIBLE_DEVICES",
"PYTORCH_CUDA_ALLOC_CONF",
"HF_HOME"
]
for var in gpu_related_vars:
print(f"{var}: {os.environ.get(var, 'Not set')}")
print("-" * 80)
# --- nvidia-smi availability ------------------------------------------------
# A working `nvidia-smi` proves the NVIDIA driver userland is visible inside
# the container; its absence usually means no GPU hardware was attached.
print("CHECKING FOR NVIDIA-SMI:")
try:
    # Bounded timeout so a wedged driver cannot hang the whole diagnostic run.
    result = subprocess.run(
        ['nvidia-smi'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=30,
    )
    if result.returncode == 0:
        print("nvidia-smi is available and working!")
        print(result.stdout)
    else:
        print("nvidia-smi error:")
        print(result.stderr)
except FileNotFoundError:
    # Most common failure: the binary simply is not in the image / on PATH.
    print("Error running nvidia-smi: nvidia-smi binary not found on PATH")
except subprocess.TimeoutExpired:
    print("Error running nvidia-smi: timed out after 30 seconds")
except Exception as e:
    # Catch-all so the diagnostic script never aborts before later checks.
    print(f"Error running nvidia-smi: {str(e)}")
print("-" * 80)
# --- PyTorch / CUDA ---------------------------------------------------------
print("CHECKING PYTORCH AND CUDA:")
try:
    import torch

    print(f"PyTorch version: {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ok}")
    print(f"CUDA version: {torch.version.cuda if cuda_ok else 'Not available'}")

    if cuda_ok:
        print(f"CUDA device count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")
        print(f"Current CUDA device: {torch.cuda.current_device()}")

    # Smoke-test tensor creation and a matmul on whichever device we have.
    print("\nTesting CUDA tensor creation:")
    try:
        start_time = time.time()
        x = torch.rand(1000, 1000, device="cuda" if cuda_ok else "cpu")
        y = x @ x  # Matrix multiplication to test computation
        # BUG FIX: torch.cuda.synchronize() raises on CPU-only PyTorch builds,
        # which previously made the CPU fallback message below unreachable
        # (the exception handler fired instead). Only sync when CUDA exists.
        if cuda_ok:
            torch.cuda.synchronize()  # Wait for async CUDA kernels to finish
        end_time = time.time()
        if cuda_ok:
            print(f"Successfully created and operated on a CUDA tensor in {end_time - start_time:.4f} seconds")
        else:
            print(f"Created and operated on a CPU tensor in {end_time - start_time:.4f} seconds (CUDA not available)")
    except Exception as e:
        print(f"Error in tensor creation/operation: {str(e)}")

    # Extra device details only make sense when a GPU is actually present.
    if cuda_ok:
        print("\nDetailed CUDA information:")
        print(f"CUDA capability: {torch.cuda.get_device_capability(0)}")
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        print(f"CUDA arch list: {torch.cuda.get_arch_list() if hasattr(torch.cuda, 'get_arch_list') else 'Not available'}")
except ImportError:
    print("PyTorch is not installed")
print("-" * 80)
# --- Gradio web interface ---------------------------------------------------
# Optional: expose the same diagnostics behind a one-click web UI so the
# results are viewable from the Space's public page.
print("CREATING SIMPLE GPU TEST WEB INTERFACE...")
try:
    import gradio as gr

    def check_gpu():
        """Run the GPU diagnostics and return a pretty-printed JSON string."""
        report = {
            "python_version": sys.version,
            "environment_vars": {var: os.environ.get(var, "Not set") for var in gpu_related_vars},
            "torch_available": False,
            "cuda_available": False,
        }
        try:
            import torch

            report["torch_available"] = True
            report["torch_version"] = torch.__version__
            report["cuda_available"] = torch.cuda.is_available()
            if not torch.cuda.is_available():
                report["gpu_test_passed"] = False
            else:
                report["cuda_version"] = torch.version.cuda
                report["cuda_device_count"] = torch.cuda.device_count()
                report["cuda_device_name"] = torch.cuda.get_device_name(0)
                # Time a small matmul to prove the GPU actually computes.
                started = time.time()
                a = torch.rand(1000, 1000, device="cuda")
                b = a @ a
                torch.cuda.synchronize()
                finished = time.time()
                report["tensor_test_time"] = f"{finished - started:.4f} seconds"
                report["gpu_test_passed"] = True
        except Exception as exc:
            report["error"] = str(exc)
            report["gpu_test_passed"] = False
        return json.dumps(report, indent=2)

    demo = gr.Interface(
        fn=check_gpu,
        inputs=[],
        outputs="text",
        title="GPU Diagnostics",
        description="Click the button to run GPU diagnostics",
    )
    print("Starting Gradio web interface on port 7860...")
    demo.launch(server_name="0.0.0.0")
except ImportError:
    print("Gradio not installed, skipping web interface")
print("Raw GPU diagnostics complete.")
print("-" * 80)