Devakumar868 committed
Commit 0fbde27 · verified · Parent(s): 55c39a0

Update app.py

Files changed (1):
  1. app.py (+19, -23)
app.py CHANGED
@@ -2,29 +2,28 @@ import os
 import gradio as gr
 import torch
 import numpy as np
-from transformers import pipeline, AutoModel
+from transformers import pipeline
 from diffusers import DiffusionPipeline
 from pyannote.audio import Pipeline as PyannotePipeline
 from dia.model import Dia
 from dac.utils import load_model as load_dac_model
 
-# 1. Retrieve HF token and set device mapping
+# Load HF token and configure multi-GPU sharding
 HF_TOKEN = os.environ["HF_TOKEN"]
-device_map = "auto"  # auto-shard models across 4×L4 GPUs
+device_map = "auto"
 
-print("Loading RVQ Codec...")
+# 1. Descript Audio Codec (RVQ)
 rvq = load_dac_model(tag="latest", model_type="44khz")
 rvq.eval()
-if torch.cuda.is_available():
-    rvq = rvq.to("cuda")
+if torch.cuda.is_available(): rvq = rvq.to("cuda")
 
-print("Loading VAD pipeline...")
+# 2. Voice Activity Detection via Pyannote
 vad_pipe = PyannotePipeline.from_pretrained(
     "pyannote/voice-activity-detection",
     use_auth_token=HF_TOKEN
 )
 
-print("Loading Ultravox pipeline...")
+# 3. Ultravox ASR+LLM (generic pipeline)
 ultravox_pipe = pipeline(
     model="fixie-ai/ultravox-v0_4",
     trust_remote_code=True,
@@ -32,13 +31,13 @@ ultravox_pipe = pipeline(
     torch_dtype=torch.float16
 )
 
-print("Loading Audio Diffusion model...")
+# 4. Audio Diffusion (Diffusers loader)
 diff_pipe = DiffusionPipeline.from_pretrained(
     "teticio/audio-diffusion-instrumental-hiphop-256",
     torch_dtype=torch.float16
 ).to("cuda")
 
-print("Loading Dia TTS (sharded across GPUs)...")
+# 5. Dia TTS with device sharding
 dia = Dia.from_pretrained(
     "nari-labs/Dia-1.6B",
     device_map=device_map,
@@ -46,37 +45,34 @@ dia = Dia.from_pretrained(
     trust_remote_code=True
 )
 
-print("All models loaded successfully!")
-
 def process_audio(audio):
     sr, array = audio
     array = array.numpy() if torch.is_tensor(array) else array
 
-    # 1. Voice activity detection
-    vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})
+    # VAD segmentation
+    _ = vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})
 
-    # 2. RVQ encode/decode
-    x = torch.tensor(array).unsqueeze(0).to("cuda")
+    # RVQ encode/decode
+    x = torch.tensor(array).unsqueeze(0).to("cuda")
     codes = rvq.encode(x)
     decoded = rvq.decode(codes).squeeze().cpu().numpy()
 
-    # 3. Ultravox ASR → text
-    out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
+    # Ultravox: speech → text
+    out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
     text = out.get("text", "")
 
-    # 4. Prosody diffusion
+    # Diffusion-based prosody
     pros = diff_pipe(raw_audio=decoded)["audios"][0]
 
-    # 5. Dia TTS synthesis
-    tts = dia.generate(f"[emotion:neutral] {text}")
+    # Dia TTS synthesis
+    tts = dia.generate(f"[emotion:neutral] {text}")
     tts_np = tts.squeeze().cpu().numpy()
     tts_np = tts_np / np.max(np.abs(tts_np)) * 0.95 if tts_np.size else tts_np
 
     return (sr, tts_np), text
 
-# Gradio UI
 with gr.Blocks(title="Maya AI 📈") as demo:
-    gr.Markdown("## Maya-AI Supernatural Conversational Agent")
+    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
     audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
     send_btn = gr.Button("Send")
     audio_out = gr.Audio(label="AI Response")
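Note for reviewers: the rendered diff cuts off inside the gr.Blocks context, before any event wiring or launch call is visible. A minimal sketch of how the shown components would typically be connected, assuming a text_out component that is not in the excerpt (hypothetical names, not part of the commit):

    with gr.Blocks(title="Maya AI 📈") as demo:
        # ... components from the diff: audio_in, send_btn, audio_out ...
        text_out = gr.Textbox(label="Transcript")  # assumed, not shown in the excerpt
        # process_audio returns ((sr, tts_np), text): one audio output, one text output
        send_btn.click(fn=process_audio, inputs=audio_in,
                       outputs=[audio_out, text_out])

    demo.launch()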
 
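Also worth flagging at the RVQ step: in the upstream descript-audio-codec package (the source of load_model), encode returns a tuple and decode consumes the continuous latents z, not the integer codes. If rvq here is the stock DAC model, the round-trip would look roughly like this (a sketch of the upstream API; whether the diff's rvq.encode/rvq.decode calls rely on a wrapper is not visible here):

    import torch
    from dac.utils import load_model as load_dac_model

    model = load_dac_model(tag="latest", model_type="44khz")
    model.eval()

    x = torch.randn(1, 1, 44100)               # (batch, channels, samples) at 44.1 kHz
    x = model.preprocess(x, 44100)             # pad to the codec's hop length
    z, codes, latents, _, _ = model.encode(x)  # tuple: (z, codes, latents, losses)
    y = model.decode(z)                        # decode takes z, not the integer codes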
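Finally, on the VAD input: gr.Audio(type="numpy") delivers (sample_rate, ndarray) with int16 microphone samples, while pyannote pipelines expect a float32 waveform shaped (channel, time). A small helper along these lines would make the torch.tensor(array).unsqueeze(0) call explicit about scaling and channel layout (to_waveform is an illustrative name, not the commit's):

    import numpy as np
    import torch

    def to_waveform(array: np.ndarray) -> torch.Tensor:
        """Gradio microphone audio -> float32 (channels, time) tensor in [-1, 1]."""
        if array.dtype == np.int16:                # Gradio's usual PCM dtype
            array = array.astype(np.float32) / 32768.0
        else:
            array = array.astype(np.float32)
        if array.ndim == 1:
            array = array[None, :]                 # mono -> (1, time)
        else:
            array = np.ascontiguousarray(array.T)  # (time, channels) -> (channels, time)
        return torch.from_numpy(array)

    # e.g. vad_pipe({"waveform": to_waveform(array), "sample_rate": sr})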