Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -3,7 +3,6 @@ import librosa
 from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, AutoModelForCausalLM, AutoProcessor
 from gtts import gTTS
 import gradio as gr
-import spaces
 from PIL import Image
 import subprocess
 
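The only change in this hunk is the removal of `import spaces`: the app stops using the ZeroGPU runtime's on-demand GPU allocation and relies on plain `torch` device detection instead (added further down). For contrast, here is a minimal sketch of the pattern being removed; this is typical ZeroGPU usage, not this app's exact code:

    import spaces
    import torch

    @spaces.GPU  # on a ZeroGPU Space, a GPU is attached only while this call runs
    def warmup():
        # CUDA is visible inside the decorated call; outside of it,
        # torch.cuda.is_available() can report False on ZeroGPU
        return torch.cuda.is_available()

Since the Space header still reports "Running on Zero", dropping the decorator means the bare `torch.cuda.is_available()` checks below may never see a GPU.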
@@ -12,21 +11,19 @@ print("Using GPU for operations when available")
 # Install flash-attn
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-# Function to safely load pipeline
-@spaces.GPU
+# Function to safely load pipeline
 def load_pipeline(model_name, **kwargs):
     try:
-        device =
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         return pipeline(model=model_name, device=device, **kwargs)
     except Exception as e:
         print(f"Error loading {model_name} pipeline: {e}")
         return None
 
-# Load Whisper model for speech recognition
-@spaces.GPU
+# Load Whisper model for speech recognition
 def load_whisper():
     try:
-        device =
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         processor = WhisperProcessor.from_pretrained("openai/whisper-small")
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
         return processor, model
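Both loaders now pick their device with `torch.cuda.is_available()`, which assumes `torch` is imported near the top of app.py (the import is outside the changed hunks). A hypothetical smoke test of the helper; transformers infers the task from the model's metadata when none is passed:

    # illustrative only: exercises load_pipeline as defined above
    pipe = load_pipeline('sarvamai/sarvam-2b-v0.5')
    if pipe is not None:
        print(pipe("Namaste", max_new_tokens=20)[0]["generated_text"])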
@@ -34,21 +31,19 @@ def load_whisper():
         print(f"Error loading Whisper model: {e}")
         return None, None
 
-# Load sarvam-2b for text generation
-@spaces.GPU
+# Load sarvam-2b for text generation
 def load_sarvam():
     return load_pipeline('sarvamai/sarvam-2b-v0.5')
 
 # Load vision model
-@spaces.GPU
 def load_vision_model():
     model_id = "microsoft/Phi-3.5-vision-instruct"
-
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype="auto", attn_implementation="flash_attention_2").to(device).eval()
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
     return model, processor
 
-# Process audio input
-@spaces.GPU
+# Process audio input
 def process_audio_input(audio, whisper_processor, whisper_model):
     if whisper_processor is None or whisper_model is None:
         return "Error: Speech recognition model is not available. Please type your message instead."
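The rebuilt `load_vision_model` pins Phi-3.5-vision to `attn_implementation="flash_attention_2"`, which requires the flash-attn wheel installed at the top of the file and a CUDA device; on CPU the load will raise. A defensive variant, not part of this commit and shown only as a sketch, would fall back to the stock attention kernels:

    import torch
    from transformers import AutoModelForCausalLM, AutoProcessor

    def load_vision_model_safe(model_id="microsoft/Phi-3.5-vision-instruct"):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            # fast path: flash-attn kernels, as in the commit
            model = AutoModelForCausalLM.from_pretrained(
                model_id, trust_remote_code=True, torch_dtype="auto",
                attn_implementation="flash_attention_2")
        except Exception:
            # flash-attn missing or unsupported (e.g. CPU-only): default attention
            model = AutoModelForCausalLM.from_pretrained(
                model_id, trust_remote_code=True, torch_dtype="auto")
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        return model.to(device).eval(), processor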
@@ -62,8 +57,7 @@ def process_audio_input(audio, whisper_processor, whisper_model):
     except Exception as e:
         return f"Error processing audio: {str(e)}. Please type your message instead."
 
-# Generate response
-@spaces.GPU
+# Generate response
 def text_to_speech(text, lang='hi'):
     try:
         # Use a better TTS engine for Indic languages
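The bodies of `process_audio_input` and `text_to_speech` fall between the hunks, so they are not shown here. A minimal sketch of the flow the imports imply (librosa for resampling, Whisper for transcription, gTTS for synthesis); names and defaults are illustrative, not the app's exact code:

    import librosa
    from gtts import gTTS

    def transcribe(audio_path, processor, model):
        # Whisper models expect 16 kHz mono input
        audio, _ = librosa.load(audio_path, sr=16000)
        inputs = processor(audio, sampling_rate=16000, return_tensors="pt").to(model.device)
        ids = model.generate(inputs.input_features)
        return processor.batch_decode(ids, skip_special_tokens=True)[0]

    def speak(text, lang="hi", out_path="response.mp3"):
        # gTTS renders the reply to an MP3 file
        gTTS(text=text, lang=lang).save(out_path)
        return out_path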
@@ -83,7 +77,6 @@ def detect_language(text):
     # Implement language detection logic here
     return 'en'  # Default to English for now
 
-@spaces.GPU
 def generate_response(transcription, sarvam_pipe):
     if sarvam_pipe is None:
         return "Error: Text generation model is not available."
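`generate_response`'s body is likewise elided. With a transformers text-generation pipeline, the core is a single call; the generation parameters below are illustrative assumptions:

    def generate(transcription, sarvam_pipe):
        # text-generation pipelines return a list of dicts keyed by 'generated_text'
        out = sarvam_pipe(transcription, max_new_tokens=100, do_sample=True, temperature=0.7)
        return out[0]["generated_text"]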
@@ -95,12 +88,11 @@ def generate_response(transcription, sarvam_pipe):
     except Exception as e:
         return f"Error generating response: {str(e)}"
 
-@spaces.GPU
 def process_image(image, text_input, vision_model, vision_processor):
     try:
         prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
         image = Image.fromarray(image).convert("RGB")
-        inputs = vision_processor(prompt, image, return_tensors="pt").to(
+        inputs = vision_processor(prompt, image, return_tensors="pt").to(vision_model.device)
         generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, eos_token_id=vision_processor.tokenizer.eos_token_id)
         generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
         response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
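The second substantive fix lands here: the processor's output is now moved to `vision_model.device` before calling `generate`, keeping the input tensors co-located with the model weights. Without this, a model resident on CUDA fed CPU tensors fails at generation time with a device-mismatch RuntimeError, so deriving the target from the model itself is safer than hard-coding a device string.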
@@ -108,10 +100,9 @@ def process_image(image, text_input, vision_model, vision_processor):
     except Exception as e:
         return f"Error processing image: {str(e)}"
 
-@spaces.GPU
 def multimodal_assistant(input_type, audio_input, text_input, image_input):
     try:
-        # Load models
+        # Load models
         whisper_processor, whisper_model = load_whisper()
         sarvam_pipe = load_sarvam()
         vision_model, vision_processor = load_vision_model()
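`multimodal_assistant` loads all three model stacks on every call and then dispatches on `input_type`; the Gradio wiring sits below the changed region and is untouched by this commit. A hypothetical minimal interface, with component labels invented for illustration:

    import gradio as gr

    demo = gr.Interface(
        fn=multimodal_assistant,
        inputs=[
            gr.Radio(["audio", "text", "image"], label="Input type"),
            gr.Audio(type="filepath", label="Audio input"),
            gr.Textbox(label="Text input"),
            gr.Image(label="Image input"),
        ],
        outputs=gr.Textbox(label="Assistant response"),
    )

    if __name__ == "__main__":
        demo.launch()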