import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, TextIteratorStreamer
import librosa
from threading import Thread
import spaces
def split_audio(audio_arrays, chunk_limit=480000):
    # Split the loaded audio into 30 s chunks (480,000 samples at 16 kHz)
    audio_splits = []
    for i in range(0, len(audio_arrays), chunk_limit):
        audio_splits.append(audio_arrays[i : i + chunk_limit])
    return audio_splits
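# Chunk-size sanity check (illustrative only): at a 16 kHz sampling rate,
# 480,000 samples correspond to 480000 / 16000 = 30 seconds of audio, so a
# 70 s clip is split into chunks of 30 s, 30 s, and 10 s. For example
# (assuming numpy is available in the environment):
#   import numpy as np
#   chunks = split_audio(np.zeros(70 * 16000))
#   [len(c) / 16000 for c in chunks]  # -> [30.0, 30.0, 10.0]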
def user(audio, text, chat_history):
    if audio is not None:
        chat_history.append(gr.ChatMessage(role="user", content={"path": audio, "alt_text": "Audio"}))
    chat_history.append({"role": "user", "content": text})
    return "", chat_history
@spaces.GPU
def process_audio(audio, text, chat_history):
    conversation = [
        {
            "role": "user",
            "content": [],
        },
    ]
    # Load the audio at 16 kHz (the rate the processor expects) only when a file
    # was actually provided, split it into 30 s chunks, and add one audio entry
    # per chunk to the conversation.
    splitted_audio = []
    if audio is not None:
        audio_array = librosa.load(audio, sr=16000)[0]
        splitted_audio = split_audio(audio_array)
        for _ in splitted_audio:
            conversation[0]["content"].append(
                {
                    "type": "audio_url",
                    "audio": "placeholder",
                }
            )
    conversation[0]["content"].append(
        {
            "type": "text",
            "text": text,
        }
    )
    # Set up the streamer for token generation
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(
        text=prompt,
        audios=splitted_audio if splitted_audio else None,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
    )
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    # Set up generation arguments including max tokens and streamer; the stop and
    # padding ids are hard-coded for this model's Qwen2-style tokenizer
    generation_args = {
        "max_new_tokens": 4096,
        "streamer": streamer,
        "eos_token_id": 151645,
        "pad_token_id": 151643,
        **inputs,
    }
    # Start a separate thread for model generation to allow streaming output
    chat_history.append({"role": "assistant", "content": ""})
    thread = Thread(
        target=model.generate,
        kwargs=generation_args,
    )
    thread.start()
    for new_text in streamer:
        chat_history[-1]["content"] += new_text
        yield chat_history
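# Because `process_audio` is a generator (it yields the growing chat history),
# Gradio re-renders the Chatbot after each yielded update, producing the
# streaming effect: `model.generate` runs in the background thread and feeds
# the TextIteratorStreamer that the loop above consumes chunk by chunk.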
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Aero-1-Audio")
    gr.Markdown(
        """
Aero-1-Audio is a lightweight audio-language model with only 1.5 billion parameters, trained on 50,000 hours of high-quality audio data. Despite its compact size, it supports a wide range of tasks, such as Automatic Speech Recognition (ASR), Basic Audio Understanding, Audio Instruction Following, and Scene Audio Analysis.

Notably, Aero-1-Audio excels at lossless ASR on ultra-long audio—up to 16 minutes—without the need for audio segmentation.

[Github](https://github.com/EvolvingLMMs-Lab/Aero-1/blob/main/README.md) | [Playground](https://huggingface.co/spaces/lmms-lab/Aero-1-Audio-Demo) | [Model Checkpoints](https://huggingface.co/lmms-lab/Aero-1-Audio-1.5B) | [Evaluation Results](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/658) | [Cookbook](https://www.lmms-lab.com/posts/lmms-lab-docs/aero_audio/)

To explore its capabilities, you can upload your own audio or record your voice directly.
Or simply start by trying the example demo below.

⚠️ Disclaimer: Aero-1-Audio is still under active development. Occasional inaccuracies may occur. We appreciate your understanding and welcome any feedback to help us make it better.
"""
    )
    chatbot = gr.Chatbot(type="messages")
    with gr.Row(variant="compact", equal_height=True):
        audio_input = gr.Audio(label="Speak Here", type="filepath")
        text_input = gr.Textbox(label="Text Input", placeholder="Please transcribe this audio for me", interactive=True)
    with gr.Row():
        chatbot_clear = gr.ClearButton([text_input, audio_input, chatbot], value="Clear")
        chatbot_submit = gr.Button("Submit", variant="primary")
    chatbot_submit.click(
        user,
        inputs=[audio_input, text_input, chatbot],
        outputs=[text_input, chatbot],
        queue=False,
    ).then(
        process_audio,
        inputs=[audio_input, text_input, chatbot],
        outputs=[chatbot],
    )
    gr.Examples(
        [
            ["Please transcribe the audio for me", "./examples/elon_musk.mp3"],
            ["Please transcribe the audio for me", "./examples/nvidia_conference.mp3"],
            ["Please follow the instruction in the audio", "./examples/audio_instruction.wav"],
            ["What is the primary instrument featured in the solo of this track?", "./examples/music_under.wav"],
            ["What weather condition can be heard in the audio?", "./examples/audio_understand.wav"],
        ],
        inputs=[text_input, audio_input],
        label="Examples",
    )
if __name__ == "__main__":
    processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "lmms-lab/Aero-1-Audio-1.5B",
        device_map="cuda",
        torch_dtype="auto",
        attn_implementation="sdpa",
        trust_remote_code=True,
    )
    demo.launch()
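# To try the demo outside of Spaces (a rough sketch; package versions are not
# pinned here, and the `spaces` package / @spaces.GPU decorator only has an
# effect on Hugging Face ZeroGPU hardware, where it requests a GPU per call):
#   pip install torch gradio transformers librosa spaces
#   python app.py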