sagar007 committed
Commit 7199d05 · verified · 1 Parent(s): 7e2d83a

Update app.py

Files changed (1)
  1. app.py +11 -20
app.py CHANGED
@@ -3,7 +3,6 @@ import librosa
 from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, AutoModelForCausalLM, AutoProcessor
 from gtts import gTTS
 import gradio as gr
-import spaces
 from PIL import Image
 import subprocess
 
@@ -12,21 +11,19 @@ print("Using GPU for operations when available")
 # Install flash-attn
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-# Function to safely load pipeline within a GPU-decorated function
-@spaces.GPU
+# Function to safely load pipeline
 def load_pipeline(model_name, **kwargs):
     try:
-        device = 0 if torch.cuda.is_available() else "cpu"
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         return pipeline(model=model_name, device=device, **kwargs)
     except Exception as e:
         print(f"Error loading {model_name} pipeline: {e}")
         return None
 
-# Load Whisper model for speech recognition within a GPU-decorated function
-@spaces.GPU
+# Load Whisper model for speech recognition
 def load_whisper():
     try:
-        device = 0 if torch.cuda.is_available() else "cpu"
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         processor = WhisperProcessor.from_pretrained("openai/whisper-small")
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
         return processor, model
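The two changes in this hunk go together: the @spaces.GPU ZeroGPU decorators are dropped, and the integer CUDA index is replaced by a device string. transformers.pipeline accepts either form, so device=0 and device="cuda" behave the same on a GPU host; the string form simply reads uniformly next to the "cpu" fallback. A minimal sketch of the resulting pattern, using an illustrative checkpoint that is not part of this commit:

import torch
from transformers import pipeline

def load_pipeline(model_name, **kwargs):
    # "cuda" targets the default GPU when one is visible, "cpu" otherwise;
    # pipeline() also accepts an integer index like the old device=0.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    try:
        return pipeline(model=model_name, device=device, **kwargs)
    except Exception as e:
        print(f"Error loading {model_name} pipeline: {e}")
        return None

pipe = load_pipeline("gpt2")  # placeholder checkpoint, any text-generation model works
if pipe is not None:
    print(pipe("Namaste", max_new_tokens=10)[0]["generated_text"])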
@@ -34,21 +31,19 @@ def load_whisper():
         print(f"Error loading Whisper model: {e}")
         return None, None
 
-# Load sarvam-2b for text generation within a GPU-decorated function
-@spaces.GPU
+# Load sarvam-2b for text generation
 def load_sarvam():
     return load_pipeline('sarvamai/sarvam-2b-v0.5')
 
 # Load vision model
-@spaces.GPU
 def load_vision_model():
     model_id = "microsoft/Phi-3.5-vision-instruct"
-    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype="auto", attn_implementation="flash_attention_2").cuda().eval()
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype="auto", attn_implementation="flash_attention_2").to(device).eval()
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
     return model, processor
 
-# Process audio input within a GPU-decorated function
-@spaces.GPU
+# Process audio input
 def process_audio_input(audio, whisper_processor, whisper_model):
     if whisper_processor is None or whisper_model is None:
         return "Error: Speech recognition model is not available. Please type your message instead."
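Replacing the hard-coded .cuda() with .to(device) makes load_vision_model runnable on a CPU-only host, with one caveat the hunk does not address: flash-attention-2 kernels only run on CUDA, so on CPU the attn_implementation would also need to change. A hedged sketch of that fallback, which is an assumption on top of this commit rather than part of it:

import torch
from transformers import AutoModelForCausalLM

def load_causal_lm(model_id):
    # Pick the device first, then choose an attention implementation that
    # actually works there: flash-attention-2 is CUDA-only, "eager" runs anywhere.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    attn = "flash_attention_2" if device == "cuda" else "eager"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype="auto",
        attn_implementation=attn,
    ).to(device).eval()
    return model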
@@ -62,8 +57,7 @@ def process_audio_input(audio, whisper_processor, whisper_model):
     except Exception as e:
         return f"Error processing audio: {str(e)}. Please type your message instead."
 
-# Generate response within a GPU-decorated function
-@spaces.GPU
+# Generate response
 def text_to_speech(text, lang='hi'):
     try:
         # Use a better TTS engine for Indic languages
@@ -83,7 +77,6 @@ def detect_language(text):
     # Implement language detection logic here
     return 'en' # Default to English for now
 
-@spaces.GPU
 def generate_response(transcription, sarvam_pipe):
     if sarvam_pipe is None:
         return "Error: Text generation model is not available."
@@ -95,12 +88,11 @@ def generate_response(transcription, sarvam_pipe):
     except Exception as e:
         return f"Error generating response: {str(e)}"
 
-@spaces.GPU
 def process_image(image, text_input, vision_model, vision_processor):
     try:
         prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
         image = Image.fromarray(image).convert("RGB")
-        inputs = vision_processor(prompt, image, return_tensors="pt").to("cuda:0")
+        inputs = vision_processor(prompt, image, return_tensors="pt").to(vision_model.device)
        generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, eos_token_id=vision_processor.tokenizer.eos_token_id)
         generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
         response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
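Using vision_model.device instead of the literal "cuda:0" keeps the processor output on whatever device the model actually landed on, so process_image no longer breaks when the model sits on CPU or a different GPU. The processor returns a batch object whose .to() moves every tensor in it at once. A small self-contained illustration of the pattern with a placeholder text model, not the Phi-3.5 checkpoint from this commit:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device).eval()
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# model.device reflects where the parameters live, so the inputs follow the
# model instead of assuming a fixed "cuda:0".
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=5)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))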
@@ -108,10 +100,9 @@ def process_image(image, text_input, vision_model, vision_processor):
     except Exception as e:
         return f"Error processing image: {str(e)}"
 
-@spaces.GPU
 def multimodal_assistant(input_type, audio_input, text_input, image_input):
     try:
-        # Load models within the GPU-decorated function
+        # Load models
         whisper_processor, whisper_model = load_whisper()
         sarvam_pipe = load_sarvam()
         vision_model, vision_processor = load_vision_model()
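One consequence of removing the decorators: under ZeroGPU, loading the models inside multimodal_assistant was the natural place, because a GPU only existed inside @spaces.GPU functions. Without them, every call now pays the full load cost again. A possible follow-up, sketched here as an assumption rather than something this commit does, is to memoize the loaders (get_whisper, get_sarvam, and get_vision are hypothetical wrappers around the loaders defined in app.py):

from functools import lru_cache

# Each loader runs once per process; later calls reuse the cached models.
@lru_cache(maxsize=1)
def get_whisper():
    return load_whisper()

@lru_cache(maxsize=1)
def get_sarvam():
    return load_sarvam()

@lru_cache(maxsize=1)
def get_vision():
    return load_vision_model()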
 