Files changed (1)
  1. app.py +22 -19
app.py CHANGED
@@ -28,13 +28,13 @@ description = """
 [[Project Page]](https://audio-agi.github.io/Separate-Anything-You-Describe) [[Paper]](https://audio-agi.github.io/Separate-Anything-You-Describe/AudioSep_arXiv.pdf) [[Code]](https://github.com/Audio-AGI/AudioSep)
 
 AudioSep is a foundation model for open-domain sound separation with natural language queries.
-AudioSep demonstrates strong separation performance and impressivezero-shot generalization ability on
+AudioSep demonstrates strong separation performance and impressive zero-shot generalization ability on
 numerous tasks such as audio event separation, musical instrument separation, and speech enhancement.
 """
 
 
-def inference(audio_file_path: str, text: str):
-    print(f"Separate audio from [{audio_file_path}] with textual query [{text}]")
+def inference(audio_file_path: str, text: str, use_chunk: bool):
+    print(f"Separate audio from [{audio_file_path}] with textual query [{text}] and chunk mode [{use_chunk}]")
     mixture, _ = librosa.load(audio_file_path, sr=32000, mono=True)
 
     with torch.no_grad():
@@ -49,9 +49,12 @@ def inference(audio_file_path: str, text: str):
             "condition": conditions,
         }
 
-        sep_segment = MODEL.ss_model(input_dict)["waveform"]
-
-        sep_segment = sep_segment.squeeze(0).squeeze(0).data.cpu().numpy()
+        if use_chunk:
+            sep_segment = model.ss_model.chunk_inference(input_dict)
+            sep_segment = np.squeeze(sep_segment)
+        else:
+            sep_segment = model.ss_model(input_dict)["waveform"]
+            sep_segment = sep_segment.squeeze(0).squeeze(0).data.cpu().numpy()
 
         return 32000, np.round(sep_segment * 32767).astype(np.int16)
 
@@ -62,21 +65,21 @@ with gr.Blocks(title="AudioSep") as demo:
         with gr.Column():
             input_audio = gr.Audio(label="Mixture", type="filepath")
             text = gr.Textbox(label="Text Query")
+            use_chunk = gr.Checkbox(label="Use Chunk Mode (Less Quality but Less Memory Used)", value=False)
         with gr.Column():
-            with gr.Column():
-                output_audio = gr.Audio(label="Separation Result", scale=10)
-                button = gr.Button(
-                    "Separate",
-                    variant="primary",
-                    scale=2,
-                    size="lg",
-                    interactive=True,
-                )
-                button.click(
-                    fn=inference, inputs=[input_audio, text], outputs=[output_audio]
-                )
+            output_audio = gr.Audio(label="Separation Result", scale=10)
+            button = gr.Button(
+                "Separate",
+                variant="primary",
+                scale=2,
+                size="lg",
+                interactive=True,
+            )
+            button.click(
+                fn=inference, inputs=[input_audio, text, use_chunk], outputs=[output_audio]
+            )
 
     gr.Markdown("## Examples")
     gr.Examples(examples=EXAMPLES, inputs=[input_audio, text])
 
-demo.queue().launch(share=True)
+demo.queue().launch(share=True)
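
The functional core of this change is the `use_chunk` branch: `model.ss_model.chunk_inference(input_dict)` trades some separation quality for a smaller memory footprint. The body of `chunk_inference` is not part of this diff, so the sketch below is only a guess at what such a helper typically does: split the mixture into fixed-length segments, run the separator on each, and concatenate the results. All names in the sketch are illustrative, not the repository's actual API.

```python
# Hypothetical sketch of a chunked-inference helper in the spirit of
# ss_model.chunk_inference; the real implementation is not shown in this diff.
import numpy as np
import torch

def chunk_inference_sketch(ss_model, mixture, condition, sr=32000, chunk_seconds=10.0):
    """Separate a long mixture chunk by chunk to bound peak GPU memory."""
    chunk_len = int(sr * chunk_seconds)
    pieces = []
    with torch.no_grad():
        for start in range(0, len(mixture), chunk_len):
            segment = torch.Tensor(mixture[start:start + chunk_len])[None, None, :]
            input_dict = {"mixture": segment, "condition": condition}
            # Same forward call as the non-chunked path, just on a shorter input.
            waveform = ss_model(input_dict)["waveform"]
            pieces.append(waveform.squeeze(0).squeeze(0).cpu().numpy())
    # Plain concatenation; an overlap-add scheme would reduce audible seams
    # at chunk boundaries.
    return np.concatenate(pieces)
```

Those hard joins at segment boundaries are presumably why the checkbox label warns about reduced quality in chunk mode.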
 
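The return value `(32000, np.round(sep_segment * 32767).astype(np.int16))` follows Gradio's convention for audio outputs: a `(sample_rate, numpy_array)` tuple, here scaling a float waveform to 16-bit PCM. The conversion assumes the separated waveform stays within [-1, 1]; if the model ever overshoots, the cast wraps around rather than saturating. A slightly more defensive variant (illustrative, not part of this PR):

```python
import numpy as np

def float_to_int16(waveform: np.ndarray) -> np.ndarray:
    """Convert a float waveform (nominally in [-1, 1]) to 16-bit PCM,
    clipping first so out-of-range samples saturate instead of wrapping."""
    return np.round(np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

# Usage inside inference():
#     return 32000, float_to_int16(sep_segment)
```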
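On the UI side, Gradio passes component values to the callback positionally, so `inputs=[input_audio, text, use_chunk]` must stay in the same order as the parameters of `inference(audio_file_path, text, use_chunk)`. Note that `gr.Examples` still lists only `[input_audio, text]`, so clicking an example leaves the checkbox at whatever state the user last set; the `value=False` default covers fresh sessions. A minimal, self-contained sketch of the same wiring pattern, with a stub callback and illustrative names:

```python
import gradio as gr

def fake_inference(audio_file_path: str, text: str, use_chunk: bool) -> str:
    # Stub standing in for the real model call; echoes what it received.
    return f"path={audio_file_path!r}, query={text!r}, chunk={use_chunk}"

with gr.Blocks(title="Wiring sketch") as sketch:
    audio = gr.Audio(label="Mixture", type="filepath")
    query = gr.Textbox(label="Text Query")
    chunk = gr.Checkbox(label="Use Chunk Mode", value=False)
    result = gr.Textbox(label="Result")
    # The order of `inputs` maps positionally onto fake_inference's parameters.
    gr.Button("Separate").click(fn=fake_inference, inputs=[audio, query, chunk], outputs=[result])

if __name__ == "__main__":
    sketch.launch()
```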