Spaces: Running on Zero
Update app.py
app.py
CHANGED
@@ -1,72 +1,106 @@
+# Import spaces first to avoid CUDA initialization issues
+import spaces
+
+# Then import other libraries
 import torch
 import librosa
-from transformers import
+from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, AutoModelForCausalLM, AutoProcessor
 from gtts import gTTS
 import gradio as gr
-import spaces
 from PIL import Image
 import os
 from langdetect import detect
 import subprocess
 
+print("Using GPU for operations when available")
+
 # Install flash-attn
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-# [removed lines truncated in the diff view]
+# Function to safely load pipeline within a GPU-decorated function
+@spaces.GPU
+def load_pipeline(model_name, **kwargs):
+    try:
+        device = 0 if torch.cuda.is_available() else "cpu"
+        return pipeline(model=model_name, device=device, **kwargs)
+    except Exception as e:
+        print(f"Error loading {model_name} pipeline: {e}")
+        return None
+
+# Load Whisper model for speech recognition within a GPU-decorated function
+@spaces.GPU
+def load_whisper():
+    try:
+        device = 0 if torch.cuda.is_available() else "cpu"
+        processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+        return processor, model
+    except Exception as e:
+        print(f"Error loading Whisper model: {e}")
+        return None, None
+
+# Load vision model within a GPU-decorated function
+@spaces.GPU
+def load_vision_model():
+    try:
+        model_id = "microsoft/Phi-3.5-vision-instruct"
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id, trust_remote_code=True, torch_dtype=torch.float16, use_flash_attention_2=False
+        )
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
+        return model, processor
+    except Exception as e:
+        print(f"Error loading vision model: {e}")
+        return None, None
+
+# Load sarvam-2b for text generation within a GPU-decorated function
+@spaces.GPU
+def load_sarvam():
+    return load_pipeline('sarvamai/sarvam-2b-v0.5')
+
+# Load all models
+whisper_processor, whisper_model = load_whisper()
+vision_model, vision_processor = load_vision_model()
+sarvam_pipe = load_sarvam()
 
 @spaces.GPU
 def process_audio_input(audio):
+    if whisper_processor is None or whisper_model is None:
+        return "Error: Speech recognition model is not available. Please type your message instead."
+
     try:
-        whisper_model.to('cuda')
         audio, sr = librosa.load(audio, sr=16000)
-        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(
+        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(whisper_model.device)
         predicted_ids = whisper_model.generate(input_features)
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-        whisper_model.to('cpu')
         return transcription
     except Exception as e:
         return f"Error processing audio: {str(e)}. Please type your message instead."
 
 @spaces.GPU
 def process_image_input(image, text_prompt):
+    if vision_model is None or vision_processor is None:
+        return "Error: Vision model is not available."
+
     try:
-        vision_model.to('cuda')
         messages = [
             {"role": "user", "content": f"{text_prompt}\n<|image_1|>"},
         ]
         prompt = vision_processor.tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
-        inputs = vision_processor(prompt, image, return_tensors="pt").to(
+        inputs = vision_processor(prompt, image, return_tensors="pt").to(vision_model.device)
         generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
         generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
         response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        vision_model.to('cpu')
         return response
     except Exception as e:
         return f"Error processing image: {str(e)}"
 
 def generate_response(transcription):
+    if sarvam_pipe is None:
+        return "Error: Text generation model is not available."
+
     try:
         response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
         return response
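
Note: the hunk ends inside generate_response, so the Gradio UI, text-to-speech, and language-detection code that the gradio, gTTS, and langdetect imports point to is not visible in this diff. The sketch below is only an illustration of how those pieces could be wired to the functions defined above; the helper names text_to_speech and chat, the language whitelist, and the interface layout are assumptions, not part of this commit.

# Illustrative sketch only: the real wiring in app.py is outside the shown hunk.
# Assumes process_audio_input and generate_response from the diff above.
import gradio as gr
from gtts import gTTS
from langdetect import detect

def text_to_speech(text):  # hypothetical helper, not in the diff
    try:
        lang = detect(text)  # guess the language of the generated reply
    except Exception:
        lang = "en"
    tts = gTTS(text=text, lang=lang if lang in ("en", "hi", "ta") else "en")
    tts.save("response.mp3")
    return "response.mp3"

def chat(audio, typed_text):
    # Prefer typed text; otherwise transcribe the recording with Whisper
    transcription = typed_text or process_audio_input(audio)
    reply = generate_response(transcription)
    return reply, text_to_speech(reply)

demo = gr.Interface(
    fn=chat,
    inputs=[gr.Audio(type="filepath"), gr.Textbox(label="Or type your message")],
    outputs=[gr.Textbox(label="Response"), gr.Audio(label="Spoken response")],
)
demo.launch()

Because process_audio_input and generate_response already return error strings instead of raising, a wrapper like this can pass their output straight to the UI without extra error handling.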