Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -69,14 +69,23 @@ def load_sarvam():
 @spaces.GPU
 def load_vision_model():
     try:
-
-
-
-
-
+        print("Starting to load vision model...")
+        model_id = "microsoft/Phi-3.5-vision-instruct"
+        print(f"Loading model from {model_id}")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+            use_flash_attention_2=False
+        )
+        print("Model loaded successfully")
+        print("Loading processor...")
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
+        print("Processor loaded successfully")
+        return model, processor
     except Exception as e:
-        print(f"
-        return None, None
+        print(f"Detailed error in loading vision model: {str(e)}")
+        return None, None


 # Process audio input within a GPU-decorated function
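The new loader pins the model to microsoft/Phi-3.5-vision-instruct and returns a (model, processor) pair, falling back to (None, None) on any exception. As a minimal standalone sketch of the happy path (without the @spaces.GPU wrapper or the progress prints, and assuming a transformers version that accepts this model's remote code), it boils down to:

    import torch
    from transformers import AutoModelForCausalLM, AutoProcessor

    model_id = "microsoft/Phi-3.5-vision-instruct"
    # trust_remote_code is required: Phi-3.5-vision ships custom modeling code.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,   # half precision to fit GPU memory
        use_flash_attention_2=False, # avoid the flash-attn dependency on Zero
    )
    # num_crops=16 matches the diff; it controls the processor's image tiling.
    processor = AutoProcessor.from_pretrained(
        model_id, trust_remote_code=True, num_crops=16
    )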
@@ -96,33 +105,28 @@ def process_audio_input(audio, whisper_processor, whisper_model):

 # Updated process_image_input function
 @spaces.GPU
-def process_image_input(image, vision_model, tokenizer):
-    if vision_model is None or tokenizer is None:
+def process_image_input(image, vision_model, processor):
+    if vision_model is None or processor is None:
         return "Error: Vision model is not available."

     try:
         # Process the image
         image = Image.open(io.BytesIO(image)).convert('RGB')
-
-
-        # Create a prompt
-        prompt = "Describe this image in detail:\n"
-        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(vision_model.device)
+        inputs = processor(images=image, return_tensors="pt").to(vision_model.device)

         # Generate text
         with torch.no_grad():
             outputs = vision_model.generate(
-                input_ids,
+                **inputs,
                 max_new_tokens=100,
                 do_sample=True,
                 top_k=50,
                 top_p=0.95,
-                num_return_sequences=1
-                image_features=image_features
+                num_return_sequences=1
             )

-        generated_text =
-        return generated_text
+        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
 # Generate response within a GPU-decorated function
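Since the rewritten process_image_input decodes raw bytes via Image.open(io.BytesIO(image)), callers must hand it an encoded image, not a PIL object. A minimal hedged driver, assuming the two functions from this file are importable and using an assumed local file name:

    # Hypothetical smoke test for the updated function; "sample.jpg" is an
    # assumed local file, not something the Space provides.
    with open("sample.jpg", "rb") as f:
        image_bytes = f.read()

    vision_model, processor = load_vision_model()
    print(process_image_input(image_bytes, vision_model, processor))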
@@ -181,14 +185,14 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
         whisper_processor, whisper_model = load_whisper()
         sarvam_pipe = load_sarvam()
-        vision_model, tokenizer = load_vision_model()
+        vision_model, processor = load_vision_model()

         if input_type == "audio" and audio_input is not None:
             transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
-            transcription = process_image_input(image_input, vision_model, tokenizer)
+            transcription = process_image_input(image_input, vision_model, processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None

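The wiring change keeps the same graceful-degradation contract: load_vision_model() returns (None, None) on failure, and the guard at the top of process_image_input turns that into an error string instead of a crash. A sketch of that failure path:

    # If loading failed, image requests degrade to an error message.
    vision_model, processor = load_vision_model()
    if vision_model is None or processor is None:
        result = process_image_input(b"", vision_model, processor)
        # result == "Error: Vision model is not available."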
@@ -201,7 +205,6 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None

-
 # Custom CSS
 custom_css = """
 body {