sagar007 committed
Commit 0f965de · verified · 1 Parent(s): f073c65

Update app.py
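
This update reworks how app.py initializes its models on a ZeroGPU Space: every model is now loaded inside a function decorated with @spaces.GPU, spaces is imported before torch to avoid CUDA initialization issues at import time, and each processing function checks for None so the app degrades gracefully when a model fails to load. A minimal sketch of the decorator pattern the change relies on (illustrative names only, not code from this commit):

# Illustrative sketch only: the spaces.GPU decorator pattern used throughout the new app.py.
# On a ZeroGPU Space, a GPU is attached only while a decorated function is running.
import spaces
import torch

@spaces.GPU
def run_on_gpu(n):
    # Use the GPU when one has been allocated, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.ones(n, device=device).sum().item()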

Files changed (1)
  1. app.py +59 -25
app.py CHANGED
@@ -1,72 +1,106 @@
  import torch
  import librosa
- from transformers import AutoModelForCausalLM, AutoProcessor, pipeline, WhisperProcessor, WhisperForConditionalGeneration
  from gtts import gTTS
  import gradio as gr
- import spaces
  from PIL import Image
  import os
  from langdetect import detect
  import subprocess

  # Install flash-attn
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

- print("Loading models...")

- # Vision model
- vision_model_id = "microsoft/Phi-3.5-vision-instruct"
- vision_model = AutoModelForCausalLM.from_pretrained(
-     vision_model_id,
-     trust_remote_code=True,
-     torch_dtype=torch.float16,
-     use_flash_attention_2=False
- )
- vision_processor = AutoProcessor.from_pretrained(vision_model_id, trust_remote_code=True, num_crops=16)

- # Whisper model
- whisper_model_id = "openai/whisper-small"
- whisper_processor = WhisperProcessor.from_pretrained(whisper_model_id)
- whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_id)

- # Sarvam model
- sarvam_pipe = pipeline('sarvamai/sarvam-2b-v0.5')

- print("All models loaded successfully")

  @spaces.GPU
  def process_audio_input(audio):
      try:
-         whisper_model.to('cuda')
          audio, sr = librosa.load(audio, sr=16000)
-         input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to('cuda')
          predicted_ids = whisper_model.generate(input_features)
          transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-         whisper_model.to('cpu')
          return transcription
      except Exception as e:
          return f"Error processing audio: {str(e)}. Please type your message instead."

  @spaces.GPU
  def process_image_input(image, text_prompt):
      try:
-         vision_model.to('cuda')
          messages = [
              {"role": "user", "content": f"{text_prompt}\n<|image_1|>"},
          ]
          prompt = vision_processor.tokenizer.apply_chat_template(
              messages, tokenize=False, add_generation_prompt=True
          )
-         inputs = vision_processor(prompt, image, return_tensors="pt").to("cuda")
          generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
          generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
          response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-         vision_model.to('cpu')
          return response
      except Exception as e:
          return f"Error processing image: {str(e)}"

  def generate_response(transcription):
      try:
          response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
          return response

+ # Import spaces first to avoid CUDA initialization issues
+ import spaces
+
+ # Then import other libraries
  import torch
  import librosa
+ from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, AutoModelForCausalLM, AutoProcessor
  from gtts import gTTS
  import gradio as gr
  from PIL import Image
  import os
  from langdetect import detect
  import subprocess

+ print("Using GPU for operations when available")
+
  # Install flash-attn
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

+ # Function to safely load pipeline within a GPU-decorated function
+ @spaces.GPU
+ def load_pipeline(model_name, **kwargs):
+     try:
+         device = 0 if torch.cuda.is_available() else "cpu"
+         return pipeline(model=model_name, device=device, **kwargs)
+     except Exception as e:
+         print(f"Error loading {model_name} pipeline: {e}")
+         return None

+ # Load Whisper model for speech recognition within a GPU-decorated function
+ @spaces.GPU
+ def load_whisper():
+     try:
+         device = 0 if torch.cuda.is_available() else "cpu"
+         processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+         return processor, model
+     except Exception as e:
+         print(f"Error loading Whisper model: {e}")
+         return None, None

+ # Load vision model within a GPU-decorated function
+ @spaces.GPU
+ def load_vision_model():
+     try:
+         model_id = "microsoft/Phi-3.5-vision-instruct"
+         model = AutoModelForCausalLM.from_pretrained(
+             model_id, trust_remote_code=True, torch_dtype=torch.float16, use_flash_attention_2=False
+         )
+         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
+         return model, processor
+     except Exception as e:
+         print(f"Error loading vision model: {e}")
+         return None, None

+ # Load sarvam-2b for text generation within a GPU-decorated function
+ @spaces.GPU
+ def load_sarvam():
+     return load_pipeline('sarvamai/sarvam-2b-v0.5')

+ # Load all models
+ whisper_processor, whisper_model = load_whisper()
+ vision_model, vision_processor = load_vision_model()
+ sarvam_pipe = load_sarvam()

  @spaces.GPU
  def process_audio_input(audio):
+     if whisper_processor is None or whisper_model is None:
+         return "Error: Speech recognition model is not available. Please type your message instead."
+
      try:
          audio, sr = librosa.load(audio, sr=16000)
+         input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(whisper_model.device)
          predicted_ids = whisper_model.generate(input_features)
          transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
          return transcription
      except Exception as e:
          return f"Error processing audio: {str(e)}. Please type your message instead."

  @spaces.GPU
  def process_image_input(image, text_prompt):
+     if vision_model is None or vision_processor is None:
+         return "Error: Vision model is not available."
+
      try:
          messages = [
              {"role": "user", "content": f"{text_prompt}\n<|image_1|>"},
          ]
          prompt = vision_processor.tokenizer.apply_chat_template(
              messages, tokenize=False, add_generation_prompt=True
          )
+         inputs = vision_processor(prompt, image, return_tensors="pt").to(vision_model.device)
          generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
          generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
          response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
          return response
      except Exception as e:
          return f"Error processing image: {str(e)}"

  def generate_response(transcription):
+     if sarvam_pipe is None:
+         return "Error: Text generation model is not available."
+
      try:
          response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
          return response
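
The hunk ends with the model-loading and processing helpers; the Gradio interface and text-to-speech wiring that consume them are outside this diff. A rough sketch of how the pieces above could be chained for one voice turn (the helper name and the language fallback are assumptions, not part of the commit):

# Rough usage sketch, not from this commit: chain the helpers defined above for one voice query.
def respond_to_voice_query(audio_path):
    transcription = process_audio_input(audio_path)    # Whisper speech-to-text
    reply = generate_response(transcription)           # sarvam-2b text generation
    try:
        lang = detect(reply)                           # langdetect is already imported in app.py
    except Exception:
        lang = "en"                                    # assumed fallback language
    gTTS(text=reply, lang=lang).save("response.mp3")   # gTTS text-to-speech
    return transcription, reply, "response.mp3"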