Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -3,71 +3,69 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModel
 from PIL import Image
 import logging
+import spaces
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 
 class LLaVAPhiModel:
     def __init__(self, model_id="sagar007/Lava_phi"):
-        self.device =
-
+        self.device = "cuda"  # Always use cuda with ZeroGPU
+        self.model_id = model_id
+        logging.info("Initializing LLaVA-Phi model...")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Initialize tokenizer (can be done outside GPU context)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # Initialize processor (can be done outside GPU context)
+        self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+        # Store conversation history
+        self.history = []
+
+        # Lazy loading of models - will be initialized in GPU context
+        self.model = None
+        self.clip = None
+
+    @spaces.GPU
+    def ensure_models_loaded(self):
+        """Ensure models are loaded in GPU context"""
+        if self.model is None:
+            # Load main model
+            from transformers import BitsAndBytesConfig
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4"
+            )
 
             self.model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-
+                self.model_id,
+                quantization_config=quantization_config,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True
             )
-
-
-
-            #
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-            self.model.config.pad_token_id = self.tokenizer.eos_token_id
-
-            # Load CLIP model and processor
-            logging.info("Loading CLIP model and processor...")
-            self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+            self.model.config.pad_token_id = self.tokenizer.eos_token_id
+
+        if self.clip is None:
+            # Load CLIP model
             self.clip = AutoModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
-
-            # Store conversation history
-            self.history = []
-
-        except Exception as e:
-            logging.error(f"Error initializing model: {str(e)}")
-            raise
 
+    @spaces.GPU
     def process_image(self, image):
         """Process image through CLIP"""
         try:
-            # Ensure
-
+            # Ensure models are loaded
+            self.ensure_models_loaded()
+
+            # Convert image to correct format
+            if isinstance(image, str):
                 image = Image.open(image)
-            elif isinstance(image, numpy.ndarray):
+            elif isinstance(image, numpy.ndarray):
                 image = Image.fromarray(image)
 
             with torch.no_grad():
@@ -79,12 +77,15 @@ class LLaVAPhiModel:
         except Exception as e:
             logging.error(f"Error processing image: {str(e)}")
             raise
-
+
+    @spaces.GPU(duration=120)  # Set longer duration for generation
     def generate_response(self, message, image=None):
         try:
+            # Ensure models are loaded
+            self.ensure_models_loaded()
+
             if image is not None:
                 try:
-                    # Get image features
                     image_features = self.process_image(image)
                     has_image = True
                 except Exception as e:
@@ -93,17 +94,12 @@ class LLaVAPhiModel:
                     has_image = False
                     message = f"Note: Failed to process image. Continuing with text only. Error: {str(e)}\n{message}"
 
-            # Format prompt
             prompt = f"human: {'<image>' if has_image else ''}\n{message}\ngpt:"
-
-            # Add context from history
             context = ""
             for turn in self.history[-3:]:
                 context += f"human: {turn[0]}\ngpt: {turn[1]}\n"
 
             full_prompt = context + prompt
-
-            # Prepare text inputs
             inputs = self.tokenizer(
                 full_prompt,
                 return_tensors="pt",
@@ -113,11 +109,9 @@ class LLaVAPhiModel:
             )
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
-            # Add image features to inputs if available
             if has_image:
                 inputs["image_features"] = image_features
 
-            # Generate response
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
@@ -134,7 +128,6 @@ class LLaVAPhiModel:
                     eos_token_id=self.tokenizer.eos_token_id
                 )
             else:
-                # Text-only response
                 prompt = f"human: {message}\ngpt:"
                 context = ""
                 for turn in self.history[-3:]:
@@ -166,10 +159,8 @@ class LLaVAPhiModel:
                     eos_token_id=self.tokenizer.eos_token_id
                 )
 
-            # Decode response
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-            # Clean up response
             if "gpt:" in response:
                 response = response.split("gpt:")[-1].strip()
             if "human:" in response:
@@ -177,9 +168,7 @@ class LLaVAPhiModel:
             if "<image>" in response:
                 response = response.replace("<image>", "").strip()
 
-            # Update history
             self.history.append((message, response))
-
             return response
 
         except Exception as e:
@@ -193,13 +182,12 @@ class LLaVAPhiModel:
 
 def create_demo():
     try:
-        # Initialize model
         model = LLaVAPhiModel()
 
         with gr.Blocks(css="footer {visibility: hidden}") as demo:
             gr.Markdown(
                 """
-                # LLaVA-Phi Demo
+                # LLaVA-Phi Demo (ZeroGPU)
                 Chat with a vision-language model that can understand both text and images.
                 """
             )
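
The pattern behind this change: on ZeroGPU hardware a GPU is attached only while a function decorated with @spaces.GPU is executing, so anything that needs CUDA (loading weights, moving tensors, generation) has to happen inside such a call, while tokenizers and processors can safely load at startup. Below is a minimal sketch of that pattern, not the Space's actual code; it assumes the spaces package available on ZeroGPU Spaces and uses distilgpt2 as a stand-in model rather than the sagar007/Lava_phi checkpoint.

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "distilgpt2"  # stand-in model for illustration, not the Space's checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)  # CPU-safe: load outside any GPU context
model = None  # heavy weights are loaded lazily, inside a GPU context

@spaces.GPU(duration=120)  # a GPU is attached only while this call runs
def generate(prompt: str) -> str:
    global model
    if model is None:
        # First decorated call: load weights while the GPU is available
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, torch_dtype=torch.float16
        ).to("cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(out[0], skip_special_tokens=True)

demo = gr.Interface(fn=generate, inputs="text", outputs="text")
demo.launch()

Lazy loading inside the decorated call keeps Space startup fast and avoids requesting a GPU before the first user interaction; the duration argument extends the default allocation window for longer generation runs, which is what the commit's @spaces.GPU(duration=120) on generate_response does.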