|
|
|
""" |
|
GPU Diagnostics Tool for Hugging Face Spaces |
|
This script performs a comprehensive check of GPU availability and functionality. |
|
""" |
|
|
|
import os |
|
import sys |
|
import subprocess |
|
import time |
|
import json |
|
|
|
# Report header: banner plus the interpreter version, so logs identify
# exactly which Python ran the diagnostics.
_rule = "=" * 80
print(_rule)
print("GPU DIAGNOSTICS TOOL")
print(_rule)

print("Python version: " + sys.version)
print("-" * 80)
|
|
|
|
|
print("ENVIRONMENT VARIABLES:")
# Environment variables that commonly influence GPU visibility and PyTorch
# allocator behaviour. NOTE: this list is also reused by the Gradio
# diagnostics callback further down the script, so keep the name stable.
gpu_related_vars = [
    "CUDA_VISIBLE_DEVICES",
    "NVIDIA_VISIBLE_DEVICES",
    "PYTORCH_CUDA_ALLOC_CONF",
    "HF_HOME",
]

for name in gpu_related_vars:
    value = os.environ.get(name, "Not set")
    print(f"{name}: {value}")
print("-" * 80)
|
|
|
|
|
print("CHECKING FOR NVIDIA-SMI:")
try:
    # capture_output=True is the documented shorthand for piping both
    # stdout and stderr; text=True decodes them to str.
    proc = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
    if proc.returncode != 0:
        print("nvidia-smi error:")
        print(proc.stderr)
    else:
        print("nvidia-smi is available and working!")
        print(proc.stdout)
except Exception as exc:
    # Broad catch is deliberate for a diagnostics script: the binary may be
    # missing entirely (FileNotFoundError) or fail in unexpected ways.
    print(f"Error running nvidia-smi: {str(exc)}")
print("-" * 80)
|
|
|
|
|
print("CHECKING PYTORCH AND CUDA:")
try:
    import torch

    # Query availability once and reuse it throughout this section.
    cuda_ok = torch.cuda.is_available()

    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {cuda_ok}")
    print(f"CUDA version: {torch.version.cuda if cuda_ok else 'Not available'}")

    if cuda_ok:
        print(f"CUDA device count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")
        print(f"Current CUDA device: {torch.cuda.current_device()}")

    # Smoke-test tensor creation and a matmul; falls back to CPU so the
    # script still reports something useful on CPU-only hosts.
    print("\nTesting CUDA tensor creation:")
    try:
        start_time = time.time()
        x = torch.rand(1000, 1000, device="cuda" if cuda_ok else "cpu")
        y = x @ x
        # BUG FIX: torch.cuda.synchronize() raises when CUDA is unavailable,
        # which previously made the CPU fallback path jump into the except
        # branch instead of printing its timing message. Only synchronize
        # when a CUDA device is actually in use (matmul is async on CUDA,
        # so the sync is needed there for a meaningful timing).
        if cuda_ok:
            torch.cuda.synchronize()
        end_time = time.time()

        if cuda_ok:
            print(f"Successfully created and operated on a CUDA tensor in {end_time - start_time:.4f} seconds")
        else:
            print(f"Created and operated on a CPU tensor in {end_time - start_time:.4f} seconds (CUDA not available)")
    except Exception as e:
        print(f"Error in tensor creation/operation: {str(e)}")

    if cuda_ok:
        print("\nDetailed CUDA information:")
        print(f"CUDA capability: {torch.cuda.get_device_capability(0)}")
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        print(f"CUDA arch list: {torch.cuda.get_arch_list() if hasattr(torch.cuda, 'get_arch_list') else 'Not available'}")
except ImportError:
    print("PyTorch is not installed")
print("-" * 80)
|
|
|
|
|
print("CREATING SIMPLE GPU TEST WEB INTERFACE...")
try:
    import gradio as gr

    def check_gpu():
        """Run the GPU diagnostics and return the results as pretty JSON."""
        report = {
            "python_version": sys.version,
            "environment_vars": {v: os.environ.get(v, "Not set") for v in gpu_related_vars},
            "torch_available": False,
            "cuda_available": False,
        }

        try:
            import torch

            report["torch_available"] = True
            report["torch_version"] = torch.__version__
            cuda_ok = torch.cuda.is_available()
            report["cuda_available"] = cuda_ok

            if not cuda_ok:
                report["gpu_test_passed"] = False
            else:
                report["cuda_version"] = torch.version.cuda
                report["cuda_device_count"] = torch.cuda.device_count()
                report["cuda_device_name"] = torch.cuda.get_device_name(0)

                # Time a small matmul; synchronize so the async CUDA kernel
                # is finished before we read the clock.
                t0 = time.time()
                mat = torch.rand(1000, 1000, device="cuda")
                _ = mat @ mat
                torch.cuda.synchronize()
                t1 = time.time()
                report["tensor_test_time"] = f"{t1 - t0:.4f} seconds"
                report["gpu_test_passed"] = True
        except Exception as exc:
            report["error"] = str(exc)
            report["gpu_test_passed"] = False

        return json.dumps(report, indent=2)

    demo = gr.Interface(
        fn=check_gpu,
        inputs=[],
        outputs="text",
        title="GPU Diagnostics",
        description="Click the button to run GPU diagnostics",
    )

    print("Starting Gradio web interface on port 7860...")
    demo.launch(server_name="0.0.0.0")
except ImportError:
    # Gradio is optional; without it the console report above is all we do.
    print("Gradio not installed, skipping web interface")
print("Raw GPU diagnostics complete.")
print("-" * 80)