mknolan committed on
Commit
de08500
·
verified ·
1 Parent(s): 961e80c

Upload debug_model_loading.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. debug_model_loading.py +108 -0
debug_model_loading.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Diagnose why loading a Hugging Face model fails on this machine.

The script runs five checks in order and prints the outcome of each:

1. Python / PyTorch / platform versions.
2. Environment variables that influence GPU visibility and the HF cache.
3. CUDA availability, a timed matmul on the GPU, and GPU memory figures.
4. Reachability of the Hugging Face Hub API for the target model.
5. An actual attempt to load the processor and model via ``transformers``.

Every check catches its own exceptions and prints a traceback, so a failure
in one section never prevents the later sections from running.
"""
import json
import os
import platform
import sys
import traceback

import requests
import torch

# Repository id of the model under diagnosis; used for the Hub API probe
# and for both ``from_pretrained`` calls.
MODEL_ID = "OpenGVLab/InternViT-6B-224px"

# Upper bound (seconds) for the Hub connectivity probe. Without a timeout
# ``requests.get`` can block forever on a stalled connection, which would
# hang the whole diagnostic before the model-loading section is reached.
HUB_TIMEOUT_SECONDS = 30

SEPARATOR = "=" * 50

print(SEPARATOR)
print("DETAILED MODEL LOADING DIAGNOSTIC")
print(SEPARATOR)

# System information
print("\n1. SYSTEM INFORMATION:")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"Platform: {platform.platform()}")
print(f"Processor: {platform.processor()}")

# Environment variables
print("\n2. ENVIRONMENT VARIABLES:")
relevant_vars = ["CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES", "TRANSFORMERS_CACHE", "HF_HOME"]
for var in relevant_vars:
    print(f"{var}: {os.environ.get(var, 'Not set')}")

# GPU information
print("\n3. GPU DETECTION:")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    try:
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

        # Test GPU with a simple operation
        print("\nTesting GPU with tensor operations...")
        test_tensor = torch.rand(1000, 1000, device="cuda")
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)

        start.record()
        # The product stays bound so the memory figures printed below
        # still include it.
        result = torch.matmul(test_tensor, test_tensor)
        end.record()

        # Both events must have completed before elapsed_time() is queried.
        torch.cuda.synchronize()
        print(f"GPU tensor operation completed in {start.elapsed_time(end):.2f} ms")

        # Memory info (total is reported for device 0; allocated/reserved
        # use the current default device).
        print(f"\nTotal GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
        print(f"Allocated GPU memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        print(f"Reserved GPU memory: {torch.cuda.memory_reserved() / 1e9:.2f} GB")

    except Exception as e:
        print(f"Error testing GPU: {e}")
        traceback.print_exc()
else:
    print("CUDA is not available. This is a critical issue for model loading.")

# HuggingFace hub connectivity
print("\n4. HUGGINGFACE HUB CONNECTIVITY:")
try:
    print("Testing connection to HuggingFace Hub...")
    response = requests.get(
        f"https://huggingface.co/api/models/{MODEL_ID}",
        timeout=HUB_TIMEOUT_SECONDS,
    )
    if response.status_code == 200:
        print("Successfully connected to HuggingFace Hub")
        model_info = response.json()
        print(f"Model exists: {MODEL_ID}")
        if 'downloads' in model_info:
            print(f"Downloads: {model_info['downloads']}")
    else:
        print(f"Failed to connect to HuggingFace Hub: Status code {response.status_code}")
        print(response.text)
except Exception as e:
    # Covers connection errors, timeouts and malformed JSON alike; the
    # traceback distinguishes them.
    print(f"Error connecting to HuggingFace Hub: {e}")
    traceback.print_exc()

# Attempt model loading with detailed error capture
print("\n5. ATTEMPTING MODEL LOADING:")
try:
    # Imported here rather than at the top so that an import failure is
    # reported as part of this section instead of aborting the script.
    print("Importing transformers...")
    from transformers import AutoModel, AutoProcessor
    print("✓ Transformers imported successfully")

    # NOTE(review): this repository may ship custom modeling code, in which
    # case from_pretrained would need trust_remote_code=True -- confirm
    # against the model card before changing the calls below.
    print("\nLoading AutoProcessor...")
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    print("✓ AutoProcessor loaded successfully")

    print("\nLoading AutoModel...")
    model = AutoModel.from_pretrained(MODEL_ID)
    print("✓ AutoModel loaded successfully")

    if torch.cuda.is_available():
        print("\nMoving model to CUDA...")
        model = model.to("cuda")
        print("✓ Model moved to CUDA successfully")

    print("\nModel loading SUCCESSFUL")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

except Exception as e:
    print(f"\n❌ ERROR LOADING MODEL: {e}")
    print("\nDetailed traceback:")
    traceback.print_exc()

print("\n" + SEPARATOR)
print("DIAGNOSTIC COMPLETE")
print(SEPARATOR)