Upload InternVL2 implementation
- Dockerfile +2 -0
- app_internvl2.py +31 -6
- requirements.txt +2 -1
Dockerfile
CHANGED
@@ -60,6 +60,8 @@ RUN pip3 install --no-cache-dir --upgrade pip && \
     pip3 install --no-cache-dir transformers==4.37.2 safetensors==0.4.1 huggingface_hub==0.19.4 && \
     # Install timm for vision models
     pip3 install --no-cache-dir timm==0.9.11 && \
+    # Install nest-asyncio for handling nested event loops
+    pip3 install --no-cache-dir nest-asyncio==1.5.8 && \
    # Install lmdeploy and its dependencies first
     pip3 install --no-cache-dir "accelerate==0.30.0" && \
     pip3 install --no-cache-dir "lmdeploy==0.5.3" && \
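The nest-asyncio pin is added because the lmdeploy pipeline drives its own asyncio machinery while Gradio already runs an event loop in the same process (see the "nested event loops" and "asyncio conflicts" comments in the diffs). A minimal sanity check, assuming it is run inside the built image; this is an illustrative sketch, not part of the commit:

```python
# Illustrative check (assumption: run inside the built image), not part of this commit.
import asyncio

import nest_asyncio

# Patch the current event loop so run_until_complete() can be called
# even when a loop is already running (e.g. under Gradio).
nest_asyncio.apply()

async def ping():
    return "ok"

# Works whether or not a loop was already running once the patch is applied.
print(asyncio.run(ping()))
```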
app_internvl2.py
CHANGED
@@ -8,6 +8,11 @@ import warnings
 import stat
 import subprocess
 import sys
+import asyncio
+import nest_asyncio
+
+# Apply nest_asyncio to allow nested event loops
+nest_asyncio.apply()
 
 # Set environment variables
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
@@ -143,11 +148,14 @@ def load_internvl2_model():
     # Configure for AWQ quantized model
     backend_config = TurbomindEngineConfig(model_format='awq')
 
-    # Create pipeline
+    # Create pipeline with non-streaming mode to avoid asyncio conflicts
     internvl2_pipeline = pipeline(
         MODEL_ID,
-        backend_config=backend_config,
-        log_level='INFO'
+        backend_config=backend_config,
+        log_level='INFO',
+        model_name_or_path=None,
+        backend_name="turbomind",
+        stream=False  # Important: disable streaming to avoid asyncio issues
     )
 
     print("InternVL2 model loaded successfully!")
@@ -189,11 +197,28 @@ def analyze_image(image, prompt):
         # If somehow it's already a PIL Image
         image_pil = image.convert('RGB')
 
-        # Run inference with the model
-        response = internvl2_pipeline((prompt, image_pil))
+        # Run inference with the model, handling event loop manually
+        loop = asyncio.get_event_loop()
+        if loop.is_running():
+            # If we're in a running event loop (like Gradio's),
+            # we need to use run_in_executor for blocking operations
+            print("Using threaded execution for model inference")
+            # Define a function that will run in a separate thread
+            def run_inference():
+                return internvl2_pipeline((prompt, image_pil))
+
+            # Run the inference in a thread pool executor
+            response = loop.run_in_executor(None, run_inference)
+            # Wait for the result
+            if hasattr(response, "result"):
+                response = response.result()
+        else:
+            # Standard synchronous execution
+            print("Using standard execution for model inference")
+            response = internvl2_pipeline((prompt, image_pil))
 
         # Get the response text
-        result = response.text
+        result = response.text if hasattr(response, "text") else str(response)
 
         elapsed_time = time.time() - start_time
         return result
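One caveat about the threaded branch above, as an observation on stock asyncio rather than a change to the commit: loop.run_in_executor() returns an asyncio.Future, and calling .result() on it from synchronous code raises InvalidStateError unless the work has already finished. A sketch of an alternative that blocks safely uses a plain concurrent.futures future instead; run_blocking_inference is a hypothetical helper, and internvl2_pipeline, prompt, and image_pil are assumed to exist as in the file above:

```python
# Hypothetical helper, not part of the commit.
from concurrent.futures import ThreadPoolExecutor

def run_blocking_inference(pipe, prompt, image):
    # Run the blocking lmdeploy pipeline call in a worker thread.
    # A concurrent.futures future's .result() blocks until the call
    # completes, so no awaiting is required in the caller.
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(pipe, (prompt, image))
        return future.result()

# Usage mirroring analyze_image() in the diff above:
# response = run_blocking_inference(internvl2_pipeline, prompt, image_pil)
```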
requirements.txt
CHANGED
@@ -16,4 +16,5 @@ packaging==23.2
 pyyaml==6.0.1
 tqdm==4.66.1
 typing-extensions==4.10.0
-timm==0.9.11
+timm==0.9.11
+nest-asyncio==1.5.8