mknolan committed on
Commit
e6a585d
·
verified ·
1 Parent(s): 3f24cf2

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +202 -0
app.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from PIL import Image
3
+ import requests
4
+ from io import BytesIO
5
+ import gradio as gr
6
+ import os
7
+ import sys
8
+ import time
9
+ import warnings
10
+
11
# Silence all warnings so the startup log stays readable.
warnings.filterwarnings("ignore")

# Startup banner: one print call, one line of output per argument.
print(
    "Starting InternVL2 with Llama3-76B initialization...",
    f"Python version: {sys.version}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

# Configure the PyTorch CUDA caching allocator through its environment variable.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
21
+
22
+ # Check GPU availability
23
def check_gpu():
    """Report whether a working CUDA device is present.

    Besides asking PyTorch whether CUDA is available, a small random tensor is
    created on the device and added to itself, so a device that is reported
    but cannot actually execute work is detected here.

    Returns:
        bool: True when the smoke test succeeds; False when CUDA is not
        available or the smoke test raises.
    """
    if torch.cuda.is_available():
        try:
            # Tiny allocation + add: enough to prove the device executes work.
            probe = torch.rand(10, device="cuda")
            _ = probe + probe
            print(f"GPU is available: {torch.cuda.get_device_name(0)}")
        except Exception as err:
            print(f"Error initializing GPU: {err}")
            return False
        return True

    print("CUDA is not available. This application requires GPU acceleration.")
    return False
37
+
38
# Evaluated once at import time; every later GPU decision reads this flag.
USE_GPU = check_gpu()

# transformers is imported defensively so the app can still start and report
# the problem when the package is missing.
try:
    from transformers import AutoModel, AutoProcessor
except ImportError as import_err:
    print(f"Error importing transformers: {import_err}")
    HAS_TRANSFORMERS = False
else:
    HAS_TRANSFORMERS = True
    print("Successfully imported transformers")

# Model handles, filled in by load_models(); None means "not loaded".
internvit_model = llama_model = processor = None
54
+
55
def load_models():
    """Load the InternViT-6B vision encoder and its processor onto the GPU.

    On success the module-level ``processor`` and ``internvit_model`` globals
    are set. On any failure they are left untouched, so callers never observe
    a half-initialised pair. ``llama_model`` is declared global for parity
    with the module's placeholders but is not loaded here.

    Returns:
        bool: True when both the processor and the model were loaded,
        False when no GPU is usable, transformers is missing, or loading
        raised.
    """
    global internvit_model, llama_model, processor

    if not USE_GPU:
        print("Cannot load models without GPU")
        return False

    if not HAS_TRANSFORMERS:
        # Without this guard the failure would surface below as a NameError
        # on AutoProcessor, which hides the real cause.
        print("Cannot load models: the transformers package is not available")
        return False

    model_id = "OpenGVLab/InternViT-6B-224px"
    try:
        print("Loading InternViT-6B model for visual feature extraction...")

        # Load into locals first so a failure on the second call does not
        # leave the globals half-populated.
        loaded_processor = AutoProcessor.from_pretrained(model_id)
        # NOTE(review): the model card loads the image processor with
        # CLIPImageProcessor; confirm AutoProcessor resolves to the same class.

        # The model card loads this checkpoint with trust_remote_code=True
        # because the architecture is defined by code shipped in the repo.
        loaded_model = AutoModel.from_pretrained(model_id, trust_remote_code=True)

        # USE_GPU is known to be True here (checked above).
        loaded_model = loaded_model.to("cuda")
    except Exception as e:
        print(f"Error loading models: {e}")
        return False

    processor = loaded_processor
    internvit_model = loaded_model
    print("InternViT-6B model loaded successfully!")

    # Only visual feature extraction is implemented; the language model is
    # intentionally not loaded in this demonstration.
    print("Note: Llama3-76B model loading is commented out for this demonstration")

    return True
83
+
84
# Load models on startup. The call runs at import time, and its boolean result
# is what process_image() and create_interface() consult to decide whether
# inference is possible.
MODELS_LOADED = load_models()
86
+
87
def _load_input_image(image_path, sample_url=None):
    """Return the input as an RGB PIL image, or None when nothing was supplied.

    Raises whatever ``requests`` or ``PIL`` raise on a failed download or an
    unreadable image; the caller turns those into a user-facing message.
    """
    nothing_uploaded = image_path is None or (
        isinstance(image_path, str) and not image_path
    )

    if nothing_uploaded:
        if not sample_url:
            return None
        # Bound the wait and reject HTTP error pages instead of handing an
        # error body to PIL.
        response = requests.get(sample_url, timeout=30)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        print(f"Loaded sample image from URL: {sample_url}")
    elif isinstance(image_path, str):
        image = Image.open(image_path)
    else:
        # Already an in-memory image object (e.g. from gr.Image(type="pil")).
        image = image_path

    # The model card converts inputs to RGB before the image processor;
    # uploads may be RGBA, palette or greyscale. Objects without .convert
    # are passed through unchanged.
    return image.convert("RGB") if hasattr(image, "convert") else image


def process_image(image_path, sample_url=None):
    """Process an image using InternViT-6B for feature extraction.

    Args:
        image_path: A filesystem path, an already-loaded image object, or
            None / "" when nothing was uploaded.
        sample_url: Optional URL that is downloaded only when no image was
            uploaded.

    Returns:
        str: A human-readable report with the shapes of the extracted
        features and the elapsed time, or a message describing why the
        image could not be processed. This function does not raise for
        download, decoding or inference failures.
    """
    try:
        image = _load_input_image(image_path, sample_url)
    except (OSError, requests.RequestException) as e:
        # OSError also covers PIL's "cannot identify image file" errors.
        return f"Error loading image: {e}"

    if image is None:
        return "No image provided"

    if not MODELS_LOADED:
        return "Models failed to load. Please check the logs."

    try:
        # Start timing
        start_time = time.time()

        # Process image through the visual encoder
        print("Processing image through InternViT-6B...")
        inputs = processor(images=image, return_tensors="pt")
        if USE_GPU:
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = internvit_model(**inputs)

        # Extract image features
        image_features = outputs.last_hidden_state
        pooled_output = outputs.pooler_output

        # In a real implementation, we would pass these features to Llama3-76B
        # For now, we'll just return info about the extracted features
        feature_info = f"""
Image successfully processed through InternViT-6B:
- Last hidden state shape: {image_features.shape}
- Pooled output shape: {pooled_output.shape}

In a complete implementation, these visual features would be passed to Llama3-76B
for generating text responses about the image.

Note: This is a demonstration of visual feature extraction only.
"""

        # Calculate elapsed time
        elapsed = time.time() - start_time

        return f"{feature_info}\n\nProcessing completed in {elapsed:.2f} seconds."

    except Exception as e:
        # UI boundary: report the failure in the results box instead of
        # surfacing a traceback to the user.
        return f"Error processing image: {e}"
146
+
147
+ # Set up Gradio interface
148
def create_interface():
    """Assemble the Gradio Blocks UI for the feature-extraction demo.

    The layout is a header with a status line, an image input with a
    "Use Sample Image" button next to a results textbox, an
    "Extract Visual Features" button, and an explanatory footer. Both buttons
    write the text returned by process_image into the results textbox.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks(title="InternVL2 with Llama3-76B") as ui:
        gr.Markdown("# InternVL2 Visual Feature Extraction Demo")
        gr.Markdown("## Using InternViT-6B for visual feature extraction")

        # Status line reflects the outcome of the import-time model load.
        if MODELS_LOADED:
            status_label = "✅ Ready"
        else:
            status_label = "❌ Models failed to load"
        gr.Markdown(f"### System Status: {status_label}")

        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Upload Image")
                sample_button = gr.Button("Use Sample Image")

            with gr.Column():
                results_box = gr.Textbox(label="Results", lines=10)

        # Run extraction on the uploaded image.
        extract_button = gr.Button("Extract Visual Features")
        extract_button.click(fn=process_image, inputs=[image_input], outputs=results_box)

        # Run extraction on a fixed sample image fetched by URL.
        sample_url = "https://huggingface.co/OpenGVLab/InternVL2/resolve/main/assets/demo.jpg"

        def use_sample():
            return process_image(None, sample_url)

        sample_button.click(fn=use_sample, inputs=[], outputs=results_box)

        # Explanatory footer.
        gr.Markdown("""
## About This Demo

This demonstration shows how to use InternViT-6B for visual feature extraction,
following the instructions from the OpenGVLab/InternVL GitHub repository.

The application extracts visual features from the input image that would typically
be passed to a language model like Llama3-76B. In a complete implementation,
these features would be used to generate text responses about the image.
""")

    return ui
198
+
199
# Script entry point: build the UI and serve it.
if __name__ == "__main__":
    demo = create_interface()
    # server_name="0.0.0.0" listens on all network interfaces rather than
    # only localhost; share=False disables Gradio's public share link.
    demo.launch(share=False, server_name="0.0.0.0")