Upload folder using huggingface_hub
Files changed:
- .gitattributes +2 -0
- .gitignore +1 -0
- Makefile +19 -0
- app.py +109 -58
- examples/audio_instruction.wav +3 -0
- examples/audio_understand.wav +3 -0
- examples/elon_musk.mp3 +3 -0
- examples/music_under.wav +3 -0
- examples/nuggets.mp3 +3 -0
- examples/nvidia_conference.mp3 +3 -0
- requirements.txt +5 -1
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__
Makefile
ADDED
@@ -0,0 +1,19 @@
+.PHONY: style format start clean
+
+
+style:
+	python -m black --line-length 119 .
+	python -m isort .
+	ruff check --fix .
+
+
+quality:
+	python -m black --check --line-length 119 .
+	python -m isort --check-only .
+	ruff check .
+
+start:
+	gradio app.py
+
+clean:
+	ps aux | grep "app" | grep -v "grep" | awk '{print $$2}' | xargs kill -9
app.py
CHANGED
@@ -1,64 +1,115 @@
 import gradio as gr
-[removed: the previous text-only chat demo; its respond(message, history, system_message, max_tokens,
- temperature, top_p) handler rebuilt the messages list from the chat history and streamed replies via
- client.chat_completion(messages, max_tokens=max_tokens, stream=True, temperature=temperature, top_p=top_p)]
+from transformers import AutoModelForCausalLM, AutoProcessor
+import librosa
+
+
+def split_audio(audio_arrays, chunk_limit=480000):
+    CHUNK_LIM = chunk_limit
+    audio_splits = []
+    # Split the loaded audio into 30 s chunks (480000 samples at 16 kHz)
+    for i in range(
+        0,
+        len(audio_arrays),
+        CHUNK_LIM,
+    ):
+        audio_splits.append(audio_arrays[i : i + CHUNK_LIM])
+    return audio_splits
+
+
+def process_audio(audio, text, chat_history):
+    conversation = [
+        {
+            "role": "user",
+            "content": [],
+        },
+    ]
+    audio = librosa.load(audio, sr=16000)[0]
+
+    if audio is not None:
+        splitted_audio = split_audio(audio)
+        # One audio placeholder per chunk, so the chat template reserves a slot for each segment
+        for au in splitted_audio:
+            conversation[0]["content"].append(
+                {
+                    "type": "audio_url",
+                    "audio": "placeholder",
+                }
+            )
+        chat_history.append({"role": "user", "content": gr.Audio(value=(16000, audio))})
+
+    conversation[0]["content"].append(
+        {
+            "type": "text",
+            "text": text,
+        }
+    )
+
+    chat_history.append({"role": "user", "content": text})
+    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+    inputs = processor(text=prompt, audios=splitted_audio, sampling_rate=16000, return_tensors="pt", padding=True)
+    inputs = {k: v.to("cuda") for k, v in inputs.items()}
+    outputs = model.generate(**inputs, eos_token_id=151645, pad_token_id=151643, max_new_tokens=4096)
+
+    # Keep only the newly generated continuation, dropping the prompt tokens
+    cont = outputs[:, inputs["input_ids"].shape[-1] :]
+
+    result = processor.batch_decode(cont, skip_special_tokens=True)[0]
+    chat_history.append(
+        {
+            "role": "assistant",
+            "content": result,
+        }
+    )
+
+    return chat_history
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("## 🎙️ Aero-1-Audio")
+    gr.Markdown(
+        """
+        Aero-1-Audio is a compact audio model. With only 1.5B parameters and 50k hours of training data, it can perform a variety of tasks, including
+        ASR, basic audio understanding, audio instruction following, and scene analysis.
+
+        We provide several examples, such as:
+        - recordings of an NVIDIA conference and of Elon Musk for long-form ASR
+        - simple audio instruction following
+        - audio understanding for weather and music
+
+        The model may not always follow your instructions and can still make mistakes.
+        """
+    )
+
+    chatbot = gr.Chatbot(type="messages")
+
+    with gr.Row(variant="compact", equal_height=True):
+        audio_input = gr.Audio(label="Speak Here", type="filepath")
+        text_input = gr.Textbox(label="Text Input", placeholder="Type here", interactive=True)
+
+    with gr.Row():
+        chatbot_clear = gr.ClearButton([text_input, audio_input, chatbot], value="Clear")
+        chatbot_submit = gr.Button("Submit", variant="primary")
+        chatbot_submit.click(
+            process_audio,
+            inputs=[audio_input, text_input, chatbot],
+            outputs=[chatbot],
+        )
+
+    gr.Examples(
+        [
+            ["Please transcribe the audio for me", "./examples/elon_musk.mp3"],
+            ["Please transcribe the audio for me", "./examples/nvidia_conference.mp3"],
+            ["Please transcribe the audio for me", "./examples/nuggets.mp3"],
+            ["Please follow the instruction in the audio", "./examples/audio_instruction.wav"],
+            ["What is the primary instrument featured in the solo of this track?", "./examples/music_under.wav"],
+            ["What weather condition can be heard in the audio?", "./examples/audio_understand.wav"],
+        ],
+        inputs=[text_input, audio_input],
+        label="Examples",
+    )


 if __name__ == "__main__":
+    processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        "lmms-lab/Aero-1-Audio-1.5B",
+        device_map="cuda",
+        torch_dtype="auto",
+        attn_implementation="sdpa",
+        trust_remote_code=True,
+    )
     demo.launch()
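For reference, the chunk size in split_audio works out to 480000 / 16000 = 30 seconds of audio per slice, with a shorter final slice when the recording does not divide evenly. Below is a minimal sketch of that chunking behaviour run against a synthetic waveform; numpy and the 75-second duration are illustrative assumptions, not part of the Space.

import numpy as np


def split_audio(audio_arrays, chunk_limit=480000):
    # Same slicing as app.py: fixed 480000-sample windows over the 1-D sample array.
    return [audio_arrays[i : i + chunk_limit] for i in range(0, len(audio_arrays), chunk_limit)]


sr = 16000                            # sampling rate used by librosa.load(..., sr=16000)
waveform = np.zeros(sr * 75)          # 75 seconds of silence stands in for a real recording
chunks = split_audio(waveform)
print([len(c) / sr for c in chunks])  # -> [30.0, 30.0, 15.0]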
examples/audio_instruction.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f422585aebb2b59288267f8bd27313c36593a8c3a4686981c081edba9b323ed3
+size 322284
examples/audio_understand.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee10f548b4852f6cf082ecb8f8a652981487bfdc081ee6eb7e1e4a7a6c63a30f
+size 1455146
examples/elon_musk.mp3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08642086ea9f6efa1aeb0593aacab8dd975bbd254c07863e92881b7c3aa464fa
+size 2686804
examples/music_under.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31ba05f979d0b66e08b6fd7eec87a6f4f1d90887111bf8de6ce8005450606d29
+size 3834990
examples/nuggets.mp3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31453a7d842b64082a0a587e4222c9e6716e3f03560d0602db5ae042a0815381
+size 772564
examples/nvidia_conference.mp3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a20df97d1fec6147accce706b61ba7e918850d75cc7044a4fa1ac72d67ad9b05
+size 14659846
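The example clips above are committed as Git LFS pointers (the *.wav and *.mp3 rules added to .gitattributes ensure that), so a checkout without the LFS content contains a small text stub rather than playable audio. A small sketch, assuming a local clone, that tells the two apart by checking for the pointer header shown in the diffs above:

from pathlib import Path

LFS_HEADER = b"version https://git-lfs.github.com/spec/v1"


def is_lfs_pointer(path: str) -> bool:
    # Pointer files are tiny and begin with the LFS spec line; real audio starts with format magic bytes.
    return Path(path).read_bytes()[: len(LFS_HEADER)] == LFS_HEADER


print(is_lfs_pointer("./examples/elon_musk.mp3"))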
requirements.txt
CHANGED
@@ -1 +1,5 @@
-huggingface_hub
+huggingface_hub
+librosa
+transformers@git+https://github.com/huggingface/[email protected]
+torch
+accelerate