Update app.py
app.py CHANGED
@@ -69,15 +69,15 @@ def load_sarvam():
 @spaces.GPU
 def load_vision_model():
     try:
-        model_id = "microsoft/
-        model = AutoModelForCausalLM.from_pretrained(
-
-        )
-
-        return model, processor
+        model_id = "microsoft/phi-2"  # Changed to phi-2 as it's more widely available
+        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16)
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
+        return model, tokenizer, image_processor
     except Exception as e:
         print(f"Error loading vision model: {e}")
-        return None, None
+        return None, None, None
+
 
 # Process audio input within a GPU-decorated function
 @spaces.GPU
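The loader now returns a `(model, tokenizer, image_processor)` triple, and the except branch returns `(None, None, None)`, so callers need only one None check. A minimal sketch of exercising it, assuming the usual `torch`/`transformers` imports (`AutoModelForCausalLM`, `AutoTokenizer`, `AutoImageProcessor`) already sit at the top of app.py; they are outside this diff:

```python
# Sketch: smoke-testing the new loader. Assumes app.py already imports
# AutoModelForCausalLM, AutoTokenizer, and AutoImageProcessor from
# transformers, plus torch (those imports are not shown in this diff).
model, tokenizer, image_processor = load_vision_model()
if model is None:
    # The except branch returns (None, None, None), so one check covers all three.
    raise SystemExit("Vision model failed to load")
print(type(model).__name__, type(tokenizer).__name__, type(image_processor).__name__)
```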
@@ -94,24 +94,35 @@ def process_audio_input(audio, whisper_processor, whisper_model):
     except Exception as e:
         return f"Error processing audio: {str(e)}. Please type your message instead."
 
-#
+# Updated process_image_input function
 @spaces.GPU
-def process_image_input(image, vision_model,
-    if vision_model is None or
+def process_image_input(image, vision_model, tokenizer, image_processor):
+    if vision_model is None or tokenizer is None or image_processor is None:
         return "Error: Vision model is not available."
 
     try:
-        #
-
+        # Process the image
+        image = Image.open(io.BytesIO(image)).convert('RGB')
+        image_features = image_processor(images=image, return_tensors="pt")["pixel_values"].to(vision_model.device)
 
-
-
+        # Create a prompt
+        prompt = "Describe this image in detail:\n"
+        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(vision_model.device)
 
+        # Generate text
         with torch.no_grad():
-            outputs = vision_model.generate(
+            outputs = vision_model.generate(
+                input_ids,
+                max_new_tokens=100,
+                do_sample=True,
+                top_k=50,
+                top_p=0.95,
+                num_return_sequences=1,
+                image_features=image_features
+            )
 
-        generated_text =
-        return generated_text
+        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return generated_text.replace(prompt, "")  # Remove the prompt from the output
     except Exception as e:
         return f"Error processing image: {str(e)}"
 # Generate response within a GPU-decorated function
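One caveat worth flagging: microsoft/phi-2 is a text-only causal LM, and the ResNet-50 features are never actually fused into it. Recent `transformers` releases validate `generate()` kwargs against the model's forward signature, so `image_features=image_features` above would most likely raise at runtime and route every image through the except branch. A hedged alternative sketch using a dedicated captioning checkpoint; the BLIP model name is an assumption, not something this commit uses, and any image-to-text checkpoint would do:

```python
# Sketch under the assumption that a true image-to-text model replaces the
# phi-2 + ResNet-50 pairing. "Salesforce/blip-image-captioning-base" is one
# commonly used checkpoint, chosen here only for illustration.
import io
from PIL import Image
from transformers import pipeline

captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

def describe_image(image_bytes: bytes) -> str:
    # Mirrors the diff's input handling: raw bytes in, PIL image to the model.
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    outputs = captioner(image)  # e.g. [{"generated_text": "a dog on a beach"}]
    return outputs[0]["generated_text"]
```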
@@ -168,17 +179,16 @@ def detect_language(text):
 @spaces.GPU
 def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
-        # Load models within the GPU-decorated function
         whisper_processor, whisper_model = load_whisper()
         sarvam_pipe = load_sarvam()
-        vision_model,
+        vision_model, tokenizer, image_processor = load_vision_model()
 
         if input_type == "audio" and audio_input is not None:
             transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
-            transcription = process_image_input(image_input, vision_model,
+            transcription = process_image_input(image_input, vision_model, tokenizer, image_processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None
 
@@ -191,6 +201,7 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None
 
+
 # Custom CSS
 custom_css = """
 body {
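A short smoke test for the updated dispatch. The success-path return shape is not visible in this diff, so only the error triple shown above is asserted, and the file name is hypothetical:

```python
# Sketch: the image branch now threads the full (model, tokenizer,
# image_processor) triple through to process_image_input. "sample.jpg" is a
# hypothetical local file; process_image_input expects raw bytes, since it
# re-opens them with Image.open(io.BytesIO(image)).
with open("sample.jpg", "rb") as f:
    image_bytes = f.read()

message, details, audio = indic_vision_assistant("image", None, None, image_bytes)
print(message)

# With no usable input the function short-circuits to the error triple:
message, details, audio = indic_vision_assistant("text", None, "", None)
assert message == "Please provide either audio, text, or image input."
```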