archit11 committed
Commit 011a958 · verified · 1 Parent(s): 06c5535

Update app.py

Files changed (1)
  1. app.py +47 -31
app.py CHANGED
@@ -1,54 +1,70 @@
 import transformers
+
 import gradio as gr
-import librosa
 import torch
 import numpy as np
+from typing import Dict, List
 import spaces
-from typing import Tuple
+
+# Constants
+MODEL_NAME = 'sarvamai/shuka_v1'
+SAMPLE_RATE = 16000
+MAX_NEW_TOKENS = 256
+
+# Load the ShukaPipeline
+def load_pipeline():
+    model = transformers.AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    pipeline = transformers.pipeline(
+        "shuka-pipeline",
+        model=model,
+        torch_dtype=torch.float16,
+        device=0 if torch.cuda.is_available() else -1,
+    )
+    return pipeline
+
+pipe = load_pipeline()
+
+def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
+    return [
+        {'role': 'system', 'content': 'Respond naturally and informatively.'},
+        {'role': 'user', 'content': prompt}
+    ]
 
 @spaces.GPU(duration=120)
-def transcribe_and_respond(audio_input: Tuple[np.ndarray, int]) -> str:
+def transcribe_and_respond(audio: np.ndarray) -> str:
     try:
-        pipe = transformers.pipeline(
-            model='sarvamai/shuka_v1',
-            trust_remote_code=True,
-            device=0,
-            torch_dtype=torch.bfloat16
-        )
-        # Unpack the audio input
-        audio, sr = audio_input
-
         # Ensure audio is float32
         if audio.dtype != np.float32:
             audio = audio.astype(np.float32)
-
-        # Resample if necessary
-        if sr != 16000:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
-
-        # Define conversation turns
-        turns = [
-            {'role': 'system', 'content': 'Respond naturally and informatively.'},
-            {'role': 'user', 'content': ''}
-        ]
 
-        # Run the pipeline with the audio and conversation turns
-        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': 16000}, max_new_tokens=512)
 
-        # Return the model's response
-        return output
 
+
+        # Create input for the pipeline
+        turns = create_conversation_turns("<|audio|>")
+        inputs = {
+            'audio': audio,
+            'turns': turns,
+            'sampling_rate': SAMPLE_RATE
+        }
+
+        # Generate response
+        response = pipe(inputs, max_new_tokens=MAX_NEW_TOKENS, temperature=0.7, repetition_penalty=1.1)
+
+        return response
     except Exception as e:
         return f"Error processing audio: {str(e)}"
 
+# Create the Gradio interface
 iface = gr.Interface(
     fn=transcribe_and_respond,
-    inputs=gr.Audio(sources="microphone", type="numpy"),
-    outputs="text",
-    title="Live Transcription and Response",
+    inputs=gr.Audio(sources="microphone", type="numpy", sampling_rate=SAMPLE_RATE),
+    outputs="text",
+    title="Live Voice Input for Transcription and Response",
     description="Speak into your microphone, and the model will respond naturally and informatively.",
-    live=True  # Enable live processing
+    live=True
 )
 
+# Launch the app
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
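
A caveat worth noting against the new version of the callback: with `type="numpy"`, `gr.Audio` passes a `(sample_rate, data)` tuple rather than a bare array (the removed code unpacked it as `audio, sr`, the reverse of the documented order), and `gr.Audio` does not appear to accept a `sampling_rate` argument, so the capture rate is not guaranteed to be 16 kHz. The previous revision resampled with librosa; this commit drops that step. Below is a minimal adapter sketch in the spirit of the old code; the helper name `prepare_audio` is hypothetical, and the [-1, 1] scaling for integer PCM is an assumption about the model's expected input range.

import librosa
import numpy as np

def prepare_audio(audio_input, target_sr: int = 16000) -> np.ndarray:
    # Hypothetical helper, not part of this commit: restores the unpacking
    # and resampling that the previous revision did inline.
    sr, audio = audio_input  # gr.Audio(type="numpy") yields (sample_rate, data)
    if np.issubdtype(audio.dtype, np.integer):
        # int16 PCM -> float32 in [-1, 1]; assumed input range for the model
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    else:
        audio = audio.astype(np.float32)
    if audio.ndim > 1:  # mix stereo down to mono
        audio = audio.mean(axis=1)
    if sr != target_sr:  # resample, as the previous revision did
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    return audio

With this helper, `transcribe_and_respond` could accept the raw tuple again and call `audio = prepare_audio(audio_input)` before building `inputs`.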