Somnath3570 committed on
Commit
3e435ed
·
verified ·
1 Parent(s): f5d5522

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -52
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # app.py
2
  import gradio as gr
3
  import torch
4
  import transformers
@@ -8,51 +7,42 @@ import os
8
 
9
  class UltravoxInterface:
10
  def __init__(self):
11
- """Initialize the Ultravox model and settings"""
12
- print("Loading Ultravox model... This may take a few minutes...")
 
 
 
13
  self.pipe = transformers.pipeline(
14
- model='fixie-ai/ultravox-v0_4',
15
- trust_remote_code=True,
 
16
  device=0 if torch.cuda.is_available() else -1
17
  )
18
- print("Model loaded successfully!")
19
 
20
- # Default system prompt
21
- self.default_prompt = "You are a friendly and helpful character. You love to answer questions for people."
22
 
23
  def process_audio(self, audio_path, custom_prompt=None):
24
- """Process audio input and return model response"""
25
  try:
26
- # Load and preprocess audio
27
- audio, sr = librosa.load(audio_path, sr=16000)
28
-
29
- # Prepare conversation turns
30
- turns = [
31
- {
32
- "role": "system",
33
- "content": custom_prompt if custom_prompt else self.default_prompt
34
- }
35
- ]
36
 
37
- # Get model response
38
- result = self.pipe(
39
- {
40
- 'audio': audio,
41
- 'turns': turns,
42
- 'sampling_rate': sr
43
- },
44
- max_new_tokens=30
45
- )
46
 
47
- # Handle different response formats
48
- if isinstance(result, str):
49
- return result
50
- elif isinstance(result, list):
51
- return result[0] if result else "No response generated"
52
- elif isinstance(result, dict):
53
- return result.get('generated_text', "No response generated")
54
- else:
55
- return str(result)
56
 
57
  except Exception as e:
58
  return f"Error processing audio: {str(e)}"
@@ -60,12 +50,12 @@ class UltravoxInterface:
60
  def create_interface(self):
61
  """Create and configure the Gradio interface"""
62
 
63
- with gr.Blocks(title="Ultravox Voice Assistant", theme=gr.themes.Soft(
64
  primary_hue="orange",
65
  secondary_hue="gray",
66
  )) as interface:
67
- gr.Markdown("# 🎙️ Ultravox Voice Assistant")
68
- gr.Markdown("Speak into the microphone and get AI-generated responses!")
69
 
70
  with gr.Row():
71
  with gr.Column():
@@ -75,12 +65,6 @@ class UltravoxInterface:
75
  type="filepath"
76
  )
77
 
78
- system_prompt = gr.Textbox(
79
- label="System Prompt (Optional)",
80
- placeholder="Enter custom system prompt or leave empty for default",
81
- value=self.default_prompt
82
- )
83
-
84
  submit_btn = gr.Button(
85
  "Process Audio",
86
  variant="primary"
@@ -88,26 +72,26 @@ class UltravoxInterface:
88
 
89
  with gr.Column():
90
  output_text = gr.Textbox(
91
- label="AI Response",
92
  lines=5,
93
- placeholder="AI response will appear here..."
94
  )
95
 
96
  submit_btn.click(
97
  fn=self.process_audio,
98
- inputs=[audio_input, system_prompt],
99
  outputs=output_text
100
  )
101
 
102
  gr.Markdown("""
103
  ## How to use:
104
  1. Click the microphone icon and allow browser access
105
- 2. Speak your question or prompt
106
  3. Click 'Stop' when finished
107
- 4. Click 'Process Audio' to get the AI response
108
 
109
  ## Note:
110
- First-time loading may take a few minutes as the model is downloaded.
111
  """)
112
 
113
  return interface
 
 
1
  import gradio as gr
2
  import torch
3
  import transformers
 
7
 
8
class UltravoxInterface:
    """Gradio front-end around a small Whisper speech-recognition pipeline."""

    def __init__(self):
        """Initialize the ASR pipeline with a small memory footprint.

        Loads ``openai/whisper-small`` instead of the full Ultravox model to
        keep memory usage low. Half precision is requested only when a CUDA
        device is available: the original code always passed
        ``torch.float16``, which breaks or degrades CPU inference when the
        ``device`` fallback (-1) is taken.
        """
        print("Initializing voice interface...")

        # Smaller Whisper checkpoint instead of the full Ultravox model.
        self.model_name = "openai/whisper-small"
        use_cuda = torch.cuda.is_available()
        self.pipe = transformers.pipeline(
            "automatic-speech-recognition",
            model=self.model_name,
            # BUG FIX: use half precision only on GPU; fp16 on CPU is not
            # supported by the Whisper pipeline.
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device=0 if use_cuda else -1,
        )
        print("Model loaded successfully!")
 
23
 
24
  def process_audio(self, audio_path, custom_prompt=None):
25
+ """Process audio with optimized memory usage"""
26
  try:
27
+ if audio_path is None:
28
+ return "Please provide an audio input."
29
+
30
+ # Load audio in chunks to save memory
31
+ audio, sr = librosa.load(audio_path, sr=16000, mono=True)
 
 
 
 
 
32
 
33
+ # Process audio in smaller segments if needed
34
+ max_length = 30 * sr # 30 seconds chunks
35
+ if len(audio) > max_length:
36
+ segments = []
37
+ for i in range(0, len(audio), max_length):
38
+ segment = audio[i:i + max_length]
39
+ result = self.pipe(segment, batch_size=1)
40
+ segments.append(result["text"])
41
+ return " ".join(segments)
42
 
43
+ # Process shorter audio directly
44
+ result = self.pipe(audio, batch_size=1)
45
+ return result["text"]
 
 
 
 
 
 
46
 
47
  except Exception as e:
48
  return f"Error processing audio: {str(e)}"
 
50
  def create_interface(self):
51
  """Create and configure the Gradio interface"""
52
 
53
+ with gr.Blocks(title="Voice Assistant", theme=gr.themes.Soft(
54
  primary_hue="orange",
55
  secondary_hue="gray",
56
  )) as interface:
57
+ gr.Markdown("# 🎙️ Voice Assistant")
58
+ gr.Markdown("Speak into the microphone and get text transcription!")
59
 
60
  with gr.Row():
61
  with gr.Column():
 
65
  type="filepath"
66
  )
67
 
 
 
 
 
 
 
68
  submit_btn = gr.Button(
69
  "Process Audio",
70
  variant="primary"
 
72
 
73
  with gr.Column():
74
  output_text = gr.Textbox(
75
+ label="Transcription",
76
  lines=5,
77
+ placeholder="Transcription will appear here..."
78
  )
79
 
80
  submit_btn.click(
81
  fn=self.process_audio,
82
+ inputs=[audio_input],
83
  outputs=output_text
84
  )
85
 
86
  gr.Markdown("""
87
  ## How to use:
88
  1. Click the microphone icon and allow browser access
89
+ 2. Speak your message
90
  3. Click 'Stop' when finished
91
+ 4. Click 'Process Audio' to get the transcription
92
 
93
  ## Note:
94
+ Optimized for short audio clips (up to 30 seconds).
95
  """)
96
 
97
  return interface