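"""Gradio demo for Aero-1-Audio (lmms-lab/Aero-1-Audio-1.5B).

Uploaded or recorded audio is resampled to 16 kHz, split into 30-second chunks,
and passed to the model together with the text prompt; the reply is streamed
token by token into the chatbot.
"""
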
import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, TextIteratorStreamer
import librosa
from threading import Thread
import spaces

def split_audio(audio_arrays, chunk_limit=480000):
    # Split the loaded audio into 30-second chunks (480,000 samples at 16 kHz)
    audio_splits = []
    for i in range(0, len(audio_arrays), chunk_limit):
        audio_splits.append(audio_arrays[i : i + chunk_limit])
    return audio_splits
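
# For scale: a 16-minute recording at 16 kHz is 16 * 60 * 16000 = 15,360,000 samples,
# so split_audio returns 32 chunks of at most 480,000 samples (30 seconds) each.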


def user(audio, text, chat_history):
    # Echo the user's audio (if any) and text into the chat history, then clear the textbox
    if audio is not None:
        chat_history.append(gr.ChatMessage(role="user", content={"path": audio, "alt_text": "Audio"}))
    chat_history.append({"role": "user", "content": text})
    return "", chat_history


@spaces.GPU
def process_audio(audio, text, chat_history):
    conversation = [
        {
            "role": "user",
            "content": [],
        },
    ]

    # Only load audio when a recording was provided; librosa.load would fail on a None path
    splitted_audio = None
    if audio is not None:
        audio_array = librosa.load(audio, sr=16000)[0]
        # Split into 30-second chunks and add one audio placeholder entry per chunk
        splitted_audio = split_audio(audio_array)
        for _ in splitted_audio:
            conversation[0]["content"].append(
                {
                    "type": "audio_url",
                    "audio": "placeholder",
                }
            )

    conversation[0]["content"].append(
        {
            "type": "text",
            "text": text,
        }
    )

    # Set up the streamer for token generation
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=prompt, audios=splitted_audio, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    # Set up generation arguments: max new tokens, streamer, and explicit EOS/pad token ids
    generation_args = {
        "max_new_tokens": 4096,
        "streamer": streamer,
        "eos_token_id": 151645,
        "pad_token_id": 151643,
        **inputs,
    }
    # Start a separate thread for model generation to allow streaming output
    chat_history.append({"role": "assistant", "content": ""})
    thread = Thread(
        target=model.generate,
        kwargs=generation_args,
    )
    thread.start()
    for new_text in streamer:
        chat_history[-1]["content"] += new_text
        yield chat_history

with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Aero-1-Audio")
    gr.Markdown(
    """
    ![logo](./examples/aero-1-audio.png)
    
    Aero-1-Audio is a lightweight audio-language model with only 1.5 billion parameters, trained on 50,000 hours of high-quality audio data. Despite its compact size, it supports a wide range of tasks, such as Automatic Speech Recognition (ASR), Basic Audio Understanding, Audio Instruction Following, and Scene Audio Analysis.

    Notably, Aero-1-Audio excels at lossless ASR on ultra-long audio—up to 16 minutes—without the need for audio segmentation.

    [Github](https://github.com/EvolvingLMMs-Lab/Aero-1/blob/main/README.md) | [Playground](https://huggingface.co/spaces/lmms-lab/Aero-1-Audio-Demo) | [Model Checkpoints](https://huggingface.co/lmms-lab/Aero-1-Audio-1.5B) | [Evaluation Results](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/658) | [Cookbook](https://www.lmms-lab.com/posts/lmms-lab-docs/aero_audio/)

    To explore its capabilities, you can upload your own audio or record your voice directly,
    or simply start by trying one of the examples below.

    ⚠️ Disclaimer: Aero-1-Audio is still under active development. Occasional inaccuracies may occur. We appreciate your understanding and welcome any feedback to help us make it better.
    """
    )

    chatbot = gr.Chatbot(type="messages")

    with gr.Row(variant="compact", equal_height=True):
        audio_input = gr.Audio(label="Speak Here", type="filepath")
        text_input = gr.Textbox(label="Text Input", placeholder="Please transcribe this audio for me", interactive=True)

    
    with gr.Row():
        chatbot_clear = gr.ClearButton([text_input, audio_input, chatbot], value="Clear")
        chatbot_submit = gr.Button("Submit", variant="primary")
        chatbot_submit.click(
            user,
            inputs=[audio_input, text_input, chatbot],
            outputs=[text_input, chatbot],
            queue=False
        ).then(
            process_audio,
            inputs=[audio_input, text_input, chatbot],
            outputs=[chatbot],
        )
    
    gr.Examples(
        [
            ["Please transcribe the audio for me", "./examples/elon_musk.mp3"],
            ["Please transcribe the audio for me", "./examples/nvidia_conference.mp3"],
            ["Please follow the instruction in the audio", "./examples/audio_instruction.wav"],
            ["What is the primary instrument featured in the solo of this track?", "./examples/music_under.wav"],
            ["What weather condition can be heard in the audio?", "./examples/audio_understand.wav"],
        ],
        inputs=[text_input, audio_input],
        label="Examples",
    )


if __name__ == "__main__":
    processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", device_map="cuda", torch_dtype="auto", attn_implementation="sdpa", trust_remote_code=True)
    demo.launch()