sagar007 committed
Commit 2553fb1 · verified · 1 Parent(s): 7199d05

Update app.py

Files changed (1): app.py (+17 -8)
app.py CHANGED
@@ -6,51 +6,57 @@ import gradio as gr
 from PIL import Image
 import subprocess
 
+# Import and initialize ZeroGPU
+import spaces
+spaces.init()
+
 print("Using GPU for operations when available")
 
 # Install flash-attn
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 # Function to safely load pipeline
+@spaces.GPU
 def load_pipeline(model_name, **kwargs):
     try:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        return pipeline(model=model_name, device=device, **kwargs)
+        return pipeline(model=model_name, device=0, **kwargs)
     except Exception as e:
         print(f"Error loading {model_name} pipeline: {e}")
         return None
 
 # Load Whisper model for speech recognition
+@spaces.GPU
 def load_whisper():
     try:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
         processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").cuda()
         return processor, model
     except Exception as e:
         print(f"Error loading Whisper model: {e}")
         return None, None
 
 # Load sarvam-2b for text generation
+@spaces.GPU
 def load_sarvam():
     return load_pipeline('sarvamai/sarvam-2b-v0.5')
 
 # Load vision model
+@spaces.GPU
 def load_vision_model():
     model_id = "microsoft/Phi-3.5-vision-instruct"
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype="auto", attn_implementation="flash_attention_2").to(device).eval()
+    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype="auto", attn_implementation="flash_attention_2").cuda().eval()
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
     return model, processor
 
 # Process audio input
+@spaces.GPU
 def process_audio_input(audio, whisper_processor, whisper_model):
     if whisper_processor is None or whisper_model is None:
         return "Error: Speech recognition model is not available. Please type your message instead."
 
     try:
         audio, sr = librosa.load(audio, sr=16000)
-        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(whisper_model.device)
+        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.cuda()
         predicted_ids = whisper_model.generate(input_features)
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
         return transcription
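
For context on the pattern this hunk introduces: on Hugging Face ZeroGPU Spaces, a GPU is attached only while a function decorated with @spaces.GPU is executing, which is why the new code can hardcode device=0 and .cuda() inside the decorated functions. A minimal sketch of the usual layout, assuming the spaces and transformers packages available in a ZeroGPU Space (the function name and generation settings are illustrative, not from this commit):

import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load once at import time; ZeroGPU permits .to("cuda") here even though
# a physical GPU is only attached while a decorated call is running.
tokenizer = AutoTokenizer.from_pretrained("sarvamai/sarvam-2b-v0.5")
model = AutoModelForCausalLM.from_pretrained("sarvamai/sarvam-2b-v0.5").to("cuda")

@spaces.GPU  # requests a GPU for the duration of each call
def generate(prompt: str) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)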
@@ -77,6 +83,7 @@ def detect_language(text):
     # Implement language detection logic here
     return 'en'  # Default to English for now
 
+@spaces.GPU
 def generate_response(transcription, sarvam_pipe):
     if sarvam_pipe is None:
         return "Error: Text generation model is not available."
@@ -88,11 +95,12 @@ def generate_response(transcription, sarvam_pipe):
     except Exception as e:
         return f"Error generating response: {str(e)}"
 
+@spaces.GPU
 def process_image(image, text_input, vision_model, vision_processor):
     try:
         prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
         image = Image.fromarray(image).convert("RGB")
-        inputs = vision_processor(prompt, image, return_tensors="pt").to(vision_model.device)
+        inputs = vision_processor(prompt, image, return_tensors="pt").to("cuda")
         generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, eos_token_id=vision_processor.tokenizer.eos_token_id)
         generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
         response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
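
One detail in this hunk worth spelling out: model.generate returns the prompt tokens followed by the newly generated tokens, so the slice generate_ids[:, inputs['input_ids'].shape[1]:] keeps only the new tokens before decoding. A self-contained toy illustration of that slicing (synthetic tensors, no model involved):

import torch

prompt_ids = torch.tensor([[101, 102, 103]])          # 3 prompt tokens
generate_ids = torch.tensor([[101, 102, 103, 7, 8]])  # prompt + 2 new tokens

# Drop the echoed prompt so only the generated continuation is decoded
new_tokens = generate_ids[:, prompt_ids.shape[1]:]
print(new_tokens)  # tensor([[7, 8]])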
@@ -100,6 +108,7 @@ def process_image(image, text_input, vision_model, vision_processor):
     except Exception as e:
         return f"Error processing image: {str(e)}"
 
+@spaces.GPU
 def multimodal_assistant(input_type, audio_input, text_input, image_input):
     try:
         # Load models
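
Since every entry point in the app is now decorated, here is a sketch of how such a function is typically wired into a Gradio UI on a ZeroGPU Space; the interface below is a hypothetical reduction, not this app's actual layout:

import gradio as gr
import spaces

@spaces.GPU  # GPU is attached per call and released afterwards
def answer(text: str) -> str:
    return f"echo: {text}"  # stand-in for the real model call

demo = gr.Interface(fn=answer, inputs=gr.Textbox(), outputs=gr.Textbox())
demo.launch()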
 