mknolan's picture
Upload InternVL2 implementation
02532a9 verified
raw
history blame
4.91 kB
#!/usr/bin/env python3
"""
GPU Diagnostics Tool for Hugging Face Spaces
This script performs a comprehensive check of GPU availability and functionality.
"""
import os
import sys
import subprocess
import time
import json
# Title banner: a heavy rule, the tool name, and another heavy rule.
print("=" * 80, "GPU DIAGNOSTICS TOOL", "=" * 80, sep="\n")

# Report the interpreter running the diagnostics, then close the section
# with a light rule.
print(f"Python version: {sys.version}", "-" * 80, sep="\n")
# Dump the GPU-related environment variables.
# `gpu_related_vars` is module-level because the web-interface report reads
# the same list.
print("ENVIRONMENT VARIABLES:")
gpu_related_vars = [
    "CUDA_VISIBLE_DEVICES",
    "NVIDIA_VISIBLE_DEVICES",
    "PYTORCH_CUDA_ALLOC_CONF",
    "HF_HOME",
]
# One "NAME: value" line per variable; unset variables show as "Not set".
print(
    *(f"{name}: {os.environ.get(name, 'Not set')}" for name in gpu_related_vars),
    sep="\n",
)
print("-" * 80)
# Check for nvidia-smi
print("CHECKING FOR NVIDIA-SMI:")
try:
    # Bound the call: without a timeout, a child process that never exits
    # would stall every diagnostic that follows. A timeout surfaces as
    # subprocess.TimeoutExpired and is reported by the handler below.
    result = subprocess.run(
        ["nvidia-smi"],
        capture_output=True,
        text=True,
        timeout=30,
    )
    if result.returncode == 0:
        print("nvidia-smi is available and working!")
        print(result.stdout)
    else:
        print("nvidia-smi error:")
        # Fall back to stdout when stderr is empty so that a failure message
        # written to stdout is not silently dropped.
        print(result.stderr or result.stdout)
except Exception as e:
    # Best-effort diagnostic: a missing binary, a timeout, or a permission
    # problem is reported and the script carries on with the other checks.
    print(f"Error running nvidia-smi: {e}")
print("-" * 80)
# Check PyTorch and CUDA
def run_tensor_test(size=1000):
    """Time a random square matrix multiply on the best available device.

    The tensor is created on ``"cuda"`` when ``torch.cuda.is_available()`` is
    true and on ``"cpu"`` otherwise. ``torch.cuda.synchronize()`` is called
    only on the CUDA path: calling it without a usable CUDA device raises,
    which would turn a successful CPU run into a reported error.

    Args:
        size: Edge length of the square matrix (default 1000).

    Returns:
        A ``(device, elapsed_seconds)`` tuple, where ``device`` is ``"cuda"``
        or ``"cpu"`` and ``elapsed_seconds`` covers tensor creation, the
        multiply and (on CUDA) the synchronize call.

    Raises:
        ImportError: If PyTorch is not installed.
        Exception: Any error raised by PyTorch is propagated to the caller.
    """
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    start_time = time.time()
    x = torch.rand(size, size, device=device)
    _ = x @ x  # Matrix multiplication to test computation
    if device == "cuda":
        torch.cuda.synchronize()  # Wait for the operation to complete
    return device, time.time() - start_time


print("CHECKING PYTORCH AND CUDA:")
try:
    import torch

    print(f"PyTorch version: {torch.__version__}")
    # Query availability once and reuse the answer for the whole section.
    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")
    print(f"CUDA version: {torch.version.cuda if cuda_available else 'Not available'}")

    if cuda_available:
        device_count = torch.cuda.device_count()
        print(f"CUDA device count: {device_count}")
        for i in range(device_count):
            print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")
        print(f"Current CUDA device: {torch.cuda.current_device()}")

    # Try to create and operate on a tensor (CUDA if available, else CPU)
    print("\nTesting CUDA tensor creation:")
    try:
        device, elapsed = run_tensor_test()
        if device == "cuda":
            print(f"Successfully created and operated on a CUDA tensor in {elapsed:.4f} seconds")
        else:
            print(f"Created and operated on a CPU tensor in {elapsed:.4f} seconds (CUDA not available)")
    except Exception as e:
        # Best-effort: report the failure and continue with the remaining checks.
        print(f"Error in tensor creation/operation: {e}")

    # Try to get more detailed CUDA info (device 0 only)
    if cuda_available:
        print("\nDetailed CUDA information:")
        print(f"CUDA capability: {torch.cuda.get_device_capability(0)}")
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        print(f"CUDA arch list: {torch.cuda.get_arch_list() if hasattr(torch.cuda, 'get_arch_list') else 'Not available'}")
except ImportError:
    print("PyTorch is not installed")
print("-" * 80)
# Create a simple GPU test with a web interface
print("CREATING SIMPLE GPU TEST WEB INTERFACE...")
try:
    import gradio as gr

    def check_gpu():
        """Build a GPU status report and return it as an indented JSON string.

        The report always contains the Python version, the values of the
        variables in ``gpu_related_vars`` and the ``torch_available`` /
        ``cuda_available`` flags. When CUDA is available it also records the
        CUDA version, the device count, the name of device 0 and the time
        taken by a 1000x1000 matrix multiply on the GPU. Any exception
        (including PyTorch not being importable) is stored under ``"error"``
        and ``"gpu_test_passed"`` is set to False.
        """
        report = {
            "python_version": sys.version,
            "environment_vars": {
                name: os.environ.get(name, "Not set") for name in gpu_related_vars
            },
            "torch_available": False,
            "cuda_available": False,
        }

        try:
            import torch

            report["torch_available"] = True
            report["torch_version"] = torch.__version__
            has_cuda = torch.cuda.is_available()
            report["cuda_available"] = has_cuda

            if not has_cuda:
                report["gpu_test_passed"] = False
            else:
                report["cuda_version"] = torch.version.cuda
                report["cuda_device_count"] = torch.cuda.device_count()
                report["cuda_device_name"] = torch.cuda.get_device_name(0)

                # Timed tensor test: synchronize before stopping the clock.
                began = time.time()
                matrix = torch.rand(1000, 1000, device="cuda")
                _ = matrix @ matrix
                torch.cuda.synchronize()
                elapsed = time.time() - began

                report["tensor_test_time"] = f"{elapsed:.4f} seconds"
                report["gpu_test_passed"] = True
        except Exception as exc:
            report["error"] = str(exc)
            report["gpu_test_passed"] = False

        return json.dumps(report, indent=2)

    # No input components: the interface only triggers check_gpu and shows
    # its text result.
    interface_settings = {
        "fn": check_gpu,
        "inputs": [],
        "outputs": "text",
        "title": "GPU Diagnostics",
        "description": "Click the button to run GPU diagnostics",
    }
    demo = gr.Interface(**interface_settings)

    print("Starting Gradio web interface on port 7860...")
    demo.launch(server_name="0.0.0.0")
except ImportError:
    print("Gradio not installed, skipping web interface")

# Reached once the launch call returns, or straight away when Gradio is absent.
print("Raw GPU diagnostics complete.", "-" * 80, sep="\n")