Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,36 +1,34 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer,
+from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPProcessor, CLIPModel
 from PIL import Image
 import logging
 import spaces
+import numpy
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 
 class LLaVAPhiModel:
     def __init__(self, model_id="sagar007/Lava_phi"):
-        self.device = "cuda"
+        self.device = "cuda"
         self.model_id = model_id
         logging.info("Initializing LLaVA-Phi model...")
 
-        # Initialize tokenizer
+        # Initialize tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
         try:
-            #
-            self.processor =
+            # Use CLIPProcessor directly instead of AutoProcessor
+            self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+            logging.info("Successfully loaded CLIP processor")
         except Exception as e:
-            logging.
-            # Fallback to basic tokenizer if needed
+            logging.error(f"Failed to load CLIP processor: {str(e)}")
             self.processor = None
 
-        # Store conversation history
         self.history = []
-
-        # Lazy loading of models - will be initialized in GPU context
         self.model = None
         self.clip = None
 
@@ -38,7 +36,7 @@ class LLaVAPhiModel:
     def ensure_models_loaded(self):
         """Ensure models are loaded in GPU context"""
         if self.model is None:
-            # Load main model
+            # Load main model with updated quantization config
             from transformers import BitsAndBytesConfig
             quantization_config = BitsAndBytesConfig(
                 load_in_4bit=True,
@@ -47,34 +45,37 @@
                 bnb_4bit_quant_type="nf4"
             )
 
-
-            self.
-
-
-
-
-
-
+            try:
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_id,
+                    quantization_config=quantization_config,
+                    device_map="auto",
+                    torch_dtype=torch.bfloat16,
+                    trust_remote_code=True
+                )
+                self.model.config.pad_token_id = self.tokenizer.eos_token_id
+                logging.info("Successfully loaded main model")
+            except Exception as e:
+                logging.error(f"Failed to load main model: {str(e)}")
+                raise
 
         if self.clip is None:
-
-
-
-
-
-
-
+            try:
+                # Use CLIPModel directly instead of AutoModel
+                self.clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
+                logging.info("Successfully loaded CLIP model")
+            except Exception as e:
+                logging.error(f"Failed to load CLIP model: {str(e)}")
+                self.clip = None
 
     @spaces.GPU
     def process_image(self, image):
-        """Process image through CLIP if available
+        """Process image through CLIP if available"""
         try:
-            # Ensure models are loaded
             self.ensure_models_loaded()
 
-            # If CLIP isn't available, return None
             if self.clip is None or self.processor is None:
-                logging.warning("CLIP model or processor not available
+                logging.warning("CLIP model or processor not available")
                 return None
 
             # Convert image to correct format
@@ -83,12 +84,18 @@ class LLaVAPhiModel:
             elif isinstance(image, numpy.ndarray):
                 image = Image.fromarray(image)
 
+            # Ensure image is in RGB mode
+            if image.mode != 'RGB':
+                image = image.convert('RGB')
+
             with torch.no_grad():
                 try:
+                    # Process image with error handling
                     image_inputs = self.processor(images=image, return_tensors="pt")
                     image_features = self.clip.get_image_features(
                         pixel_values=image_inputs.pixel_values.to(self.device)
                    )
+                    logging.info("Successfully processed image through CLIP")
                     return image_features
                 except Exception as e:
                     logging.error(f"Error during image processing: {str(e)}")
@@ -97,10 +104,9 @@ class LLaVAPhiModel:
             logging.error(f"Error in process_image: {str(e)}")
             return None
 
-    @spaces.GPU(duration=120)
+    @spaces.GPU(duration=120)
     def generate_response(self, message, image=None):
         try:
-            # Ensure models are loaded
             self.ensure_models_loaded()
 
             if image is not None:
@@ -176,6 +182,7 @@ class LLaVAPhiModel:
 
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
+            # Clean up response
            if "gpt:" in response:
                 response = response.split("gpt:")[-1].strip()
             if "human:" in response:
@@ -190,7 +197,7 @@ class LLaVAPhiModel:
             logging.error(f"Error generating response: {str(e)}")
             logging.error(f"Full traceback:", exc_info=True)
             return f"Error: {str(e)}"
-
+
     def clear_history(self):
         self.history = []
         return None
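
The hunks above only cover the LLaVAPhiModel class; the Gradio UI wiring elsewhere in app.py is outside the changed region and is not shown in this commit. For orientation, here is a minimal sketch of how a class like this is typically hooked into a ZeroGPU Space. It is an assumption, not code from this diff: the respond function, component labels, and title are illustrative, and it presumes the LLaVAPhiModel class above is defined in the same file.

# Illustrative only -- not part of this commit; assumes LLaVAPhiModel from app.py above.
import gradio as gr

model = LLaVAPhiModel()  # tokenizer/processor load here; heavy models load lazily in GPU context

def respond(message, image):
    # generate_response is decorated with @spaces.GPU, so ZeroGPU
    # allocates a GPU for the duration of each call.
    return model.generate_response(message, image=image)

demo = gr.Interface(
    fn=respond,
    inputs=[gr.Textbox(label="Message"), gr.Image(type="pil", label="Image (optional)")],
    outputs=gr.Textbox(label="Response"),
    title="LLaVA-Phi",
)

if __name__ == "__main__":
    demo.launch()

On ZeroGPU, the @spaces.GPU decorators seen in the diff are what request a GPU for process_image and generate_response; duration=120 extends the allocation window beyond the default for the longer generation call, which is why model loading is deferred to ensure_models_loaded inside those GPU-decorated methods.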