File size: 4,912 Bytes
02532a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python3
"""
GPU Diagnostics Tool for Hugging Face Spaces
This script performs a comprehensive check of GPU availability and functionality.
"""

import os
import sys
import subprocess
import time
import json

heavy_rule = "=" * 80
light_rule = "-" * 80

# Title banner
print(heavy_rule, "GPU DIAGNOSTICS TOOL", heavy_rule, sep="\n")

# Report the interpreter the diagnostics are running under
print(f"Python version: {sys.version}")
print(light_rule)

# Report GPU-related environment variables.
# The list is kept at module level because check_gpu() further down reads it too.
print("ENVIRONMENT VARIABLES:")
gpu_related_vars = [
    "CUDA_VISIBLE_DEVICES",
    "NVIDIA_VISIBLE_DEVICES",
    "PYTORCH_CUDA_ALLOC_CONF",
    "HF_HOME",
]

# One "NAME: value" line per variable; unset variables show as "Not set".
print("\n".join(
    f"{name}: {os.environ.get(name, 'Not set')}" for name in gpu_related_vars
))
print(light_rule)

# Check for nvidia-smi
# Runs the `nvidia-smi` command-line tool and prints its report, or the reason
# it could not be run. Any failure is reported and the script carries on.
print("CHECKING FOR NVIDIA-SMI:")
try:
    result = subprocess.run(
        ["nvidia-smi"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # Bound the call so a stuck nvidia-smi cannot block the remaining
        # diagnostics; a timeout is reported through the except branch below.
        timeout=30,
    )
    if result.returncode == 0:
        print("nvidia-smi is available and working!")
        print(result.stdout)
    else:
        print("nvidia-smi error:")
        # The failure text may have been written to stdout rather than stderr;
        # fall back to stdout so the message is not silently lost.
        print(result.stderr or result.stdout)
except Exception as e:
    # Top-level diagnostic boundary: a missing binary (FileNotFoundError),
    # a timeout, or a permission problem are all just reported.
    print(f"Error running nvidia-smi: {str(e)}")
print("-" * 80)

# Check PyTorch and CUDA
# Prints the PyTorch/CUDA versions, enumerates CUDA devices when present, and
# times a 1000x1000 matrix multiplication on the GPU (or on the CPU when CUDA
# is not available).
print("CHECKING PYTORCH AND CUDA:")
try:
    import torch

    # Query availability once; every CUDA-only call below is gated on it.
    cuda_available = torch.cuda.is_available()

    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {cuda_available}")
    print(f"CUDA version: {torch.version.cuda if cuda_available else 'Not available'}")

    if cuda_available:
        print(f"CUDA device count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")
        print(f"Current CUDA device: {torch.cuda.current_device()}")

    # Try to create and operate on a tensor (CUDA when available, CPU otherwise)
    print("\nTesting CUDA tensor creation:")
    try:
        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()
        x = torch.rand(1000, 1000, device="cuda" if cuda_available else "cpu")
        y = x @ x  # Matrix multiplication to test computation
        if cuda_available:
            # Wait for the GPU work to complete so the timing includes it.
            # This must only run with CUDA: without it the call raises, which
            # previously made the CPU path always report an error.
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

        if cuda_available:
            print(f"Successfully created and operated on a CUDA tensor in {elapsed:.4f} seconds")
        else:
            print(f"Created and operated on a CPU tensor in {elapsed:.4f} seconds (CUDA not available)")
    except Exception as e:
        print(f"Error in tensor creation/operation: {str(e)}")

    # Try to get more detailed CUDA info (first device only)
    if cuda_available:
        print("\nDetailed CUDA information:")
        print(f"CUDA capability: {torch.cuda.get_device_capability(0)}")
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        print(f"CUDA arch list: {torch.cuda.get_arch_list() if hasattr(torch.cuda, 'get_arch_list') else 'Not available'}")
except ImportError:
    print("PyTorch is not installed")
except Exception as e:
    # Diagnostic boundary: report an unexpected failure in the CUDA queries
    # instead of aborting the rest of the script.
    print(f"Error during PyTorch/CUDA checks: {e}")
print("-" * 80)

# Create a simple GPU test with a web interface
# Builds a Gradio interface whose only action re-runs the GPU checks and shows
# the result as JSON text. Skipped with a message when Gradio is not installed.
print("CREATING SIMPLE GPU TEST WEB INTERFACE...")
try:
    import gradio as gr

    def check_gpu():
        """Run the GPU checks on demand and return the findings as JSON text.

        Returns:
            A JSON string (indented by 2) holding the Python version, the
            values of the variables in ``gpu_related_vars``, and the flags
            ``torch_available``, ``cuda_available`` and ``gpu_test_passed``.
            When CUDA is available it also holds the torch/CUDA versions, the
            device count, the name of device 0 and the matmul timing. If
            anything raises, the message is stored under ``error`` and
            ``gpu_test_passed`` is False.
        """
        results = {
            "python_version": sys.version,
            "environment_vars": {var: os.environ.get(var, "Not set") for var in gpu_related_vars},
            "torch_available": False,
            "cuda_available": False
        }

        try:
            import torch
            results["torch_available"] = True
            results["torch_version"] = torch.__version__
            results["cuda_available"] = torch.cuda.is_available()

            if torch.cuda.is_available():
                results["cuda_version"] = torch.version.cuda
                results["cuda_device_count"] = torch.cuda.device_count()
                results["cuda_device_name"] = torch.cuda.get_device_name(0)

                # Test tensor creation. perf_counter is monotonic, so the
                # measured interval is not affected by wall-clock adjustments.
                start_time = time.perf_counter()
                x = torch.rand(1000, 1000, device="cuda")
                _ = x @ x  # result is discarded; only the computation matters
                torch.cuda.synchronize()  # wait for the GPU work before stopping the clock
                elapsed = time.perf_counter() - start_time
                results["tensor_test_time"] = f"{elapsed:.4f} seconds"
                results["gpu_test_passed"] = True
            else:
                results["gpu_test_passed"] = False
        except Exception as e:
            # Covers a missing torch install as well as any CUDA runtime
            # failure; the UI shows the message instead of a stack trace.
            results["error"] = str(e)
            results["gpu_test_passed"] = False

        return json.dumps(results, indent=2)

    demo = gr.Interface(
        fn=check_gpu,
        inputs=[],
        outputs="text",
        title="GPU Diagnostics",
        description="Click the button to run GPU diagnostics"
    )

    print("Starting Gradio web interface on port 7860...")
    # Bind to all interfaces so the app is reachable from outside the container.
    demo.launch(server_name="0.0.0.0")
except ImportError:
    print("Gradio not installed, skipping web interface")
    print("Raw GPU diagnostics complete.")
print("-" * 80)