kcz358 committed
Commit 9cdb7cc · verified · 1 Parent(s): b199ca0

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
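The two added rules route the *.wav and *.mp3 example files introduced in this commit through Git LFS, so the repository stores small pointer files rather than the audio itself (the pointers are visible in the examples/ sections below).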
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
Makefile ADDED
@@ -0,0 +1,19 @@
+ .PHONY: style quality start clean
+
+
+ style:
+ 	python -m black --line-length 119 .
+ 	python -m isort .
+ 	ruff check --fix .
+
+
+ quality:
+ 	python -m black --check --line-length 119 .
+ 	python -m isort --check-only .
+ 	ruff check .
+
+ start:
+ 	gradio app.py
+
+ clean:
+ 	ps aux | grep "app" | grep -v "grep" | awk '{print $$2}' | xargs kill -9
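One detail in the clean recipe: the $$2 is Make's escaping of awk's $2, since a single $ would be expanded by Make before the shell ever runs the pipeline. Typical usage: make style to auto-format, make quality to check formatting only, and make start to launch the Gradio app.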
app.py CHANGED
@@ -1,64 +1,115 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ import librosa
+
+
+ def split_audio(audio_arrays, chunk_limit=480000):
+     # Split the loaded waveform into 30 s chunks (480000 samples at 16 kHz).
+     audio_splits = []
+     for i in range(0, len(audio_arrays), chunk_limit):
+         audio_splits.append(audio_arrays[i : i + chunk_limit])
+     return audio_splits
+
+
+ # Build the conversation, run the model on the (chunked) audio, and update the chat history.
+ def process_audio(audio, text, chat_history):
+     conversation = [
+         {
+             "role": "user",
+             "content": [],
+         },
+     ]
+
+     splitted_audio = None  # no audio recorded: fall through with text only
+     if audio is not None:
+         audio = librosa.load(audio, sr=16000)[0]
+         splitted_audio = split_audio(audio)
+         # One "audio_url" placeholder per chunk, so the chat template
+         # reserves a slot for each audio segment.
+         for au in splitted_audio:
+             conversation[0]["content"].append(
+                 {
+                     "type": "audio_url",
+                     "audio": "placeholder",
+                 }
+             )
+         chat_history.append({"role": "user", "content": gr.Audio(value=(16000, audio))})
+
+     conversation[0]["content"].append(
+         {
+             "type": "text",
+             "text": text,
+         }
+     )
+
+     chat_history.append({"role": "user", "content": text})
+     prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+     inputs = processor(text=prompt, audios=splitted_audio, sampling_rate=16000, return_tensors="pt", padding=True)
+     inputs = {k: v.to("cuda") for k, v in inputs.items()}
+     outputs = model.generate(**inputs, eos_token_id=151645, pad_token_id=151643, max_new_tokens=4096)
+
+     # Strip the prompt tokens; keep only the newly generated continuation.
+     cont = outputs[:, inputs["input_ids"].shape[-1] :]
+
+     result = processor.batch_decode(cont, skip_special_tokens=True)[0]
+     chat_history.append(
+         {
+             "role": "assistant",
+             "content": result,
+         }
+     )
+
+     return chat_history
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("## 🎙️ Aero-1-Audio")
+     gr.Markdown(
+         """
+         Aero-1-Audio is a compact audio model. With only 1.5B parameters and 50k hours of training data, it can perform a variety of tasks, including
+         ASR, basic audio understanding, audio instruction following, and scene analysis.
+
+         We provide several examples, such as:
+         - an NVIDIA conference and a show from Elon Musk for long-form ASR
+         - simple audio instruction following
+         - audio understanding for weather and music
+
+         The model may fail to follow your instructions in some cases and can often be wrong.
+         """
+     )
+
+     chatbot = gr.Chatbot(type="messages")
+
+     with gr.Row(variant="compact", equal_height=True):
+         audio_input = gr.Audio(label="Speak Here", type="filepath")
+         text_input = gr.Textbox(label="Text Input", placeholder="Type here", interactive=True)
+
+     with gr.Row():
+         chatbot_clear = gr.ClearButton([text_input, audio_input, chatbot], value="Clear")
+         chatbot_submit = gr.Button("Submit", variant="primary")
+         chatbot_submit.click(
+             process_audio,
+             inputs=[audio_input, text_input, chatbot],
+             outputs=[chatbot],
+         )
+
+     gr.Examples(
+         [
+             ["Please transcribe the audio for me", "./examples/elon_musk.mp3"],
+             ["Please transcribe the audio for me", "./examples/nvidia_conference.mp3"],
+             ["Please transcribe the audio for me", "./examples/nuggets.mp3"],
+             ["Please follow the instruction in the audio", "./examples/audio_instruction.wav"],
+             ["What is the primary instrument featured in the solo of this track?", "./examples/music_under.wav"],
+             ["What weather condition can be heard in the audio?", "./examples/audio_understand.wav"],
+         ],
+         inputs=[text_input, audio_input],
+         label="Examples",
+     )
+
+
  if __name__ == "__main__":
+     processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
+     model = AutoModelForCausalLM.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", device_map="cuda", torch_dtype="auto", attn_implementation="sdpa", trust_remote_code=True)
      demo.launch()
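Two notes on the added code. First, the hard-coded generation ids are presumably the special tokens of the underlying Qwen-style tokenizer (151645 for <|im_end|> as EOS, 151643 for <|endoftext|> as padding); the model card is the authoritative source. Second, the chunk size: at the 16 kHz rate passed to librosa.load, the default chunk_limit of 480000 samples is 480000 / 16000 = 30 seconds, which is where the "30 s chunks" comment comes from. A minimal standalone sketch of the chunking behavior (NumPy only; the waveform here is synthetic):

import numpy as np

SAMPLE_RATE = 16000   # matches librosa.load(..., sr=16000)
CHUNK_LIMIT = 480000  # 480000 samples / 16000 Hz = 30 s per chunk

def split_audio(audio_arrays, chunk_limit=CHUNK_LIMIT):
    # Same slicing as in app.py; the final chunk may be shorter than chunk_limit.
    return [audio_arrays[i : i + chunk_limit] for i in range(0, len(audio_arrays), chunk_limit)]

waveform = np.zeros(75 * SAMPLE_RATE, dtype=np.float32)  # a fake 75 s clip
print([len(c) / SAMPLE_RATE for c in split_audio(waveform)])  # [30.0, 30.0, 15.0]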
examples/audio_instruction.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f422585aebb2b59288267f8bd27313c36593a8c3a4686981c081edba9b323ed3
+ size 322284
examples/audio_understand.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee10f548b4852f6cf082ecb8f8a652981487bfdc081ee6eb7e1e4a7a6c63a30f
+ size 1455146
examples/elon_musk.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:08642086ea9f6efa1aeb0593aacab8dd975bbd254c07863e92881b7c3aa464fa
+ size 2686804
examples/music_under.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31ba05f979d0b66e08b6fd7eec87a6f4f1d90887111bf8de6ce8005450606d29
+ size 3834990
examples/nuggets.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31453a7d842b64082a0a587e4222c9e6716e3f03560d0602db5ae042a0815381
+ size 772564
examples/nvidia_conference.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a20df97d1fec6147accce706b61ba7e918850d75cc7044a4fa1ac72d67ad9b05
+ size 14659846
requirements.txt CHANGED
@@ -1 +1,5 @@
- huggingface_hub==0.25.2
+ huggingface_hub
+ librosa
+ transformers@git+https://github.com/huggingface/[email protected]
+ torch
+ accelerate
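The exact huggingface_hub pin is dropped, and transformers is installed from the v4.51.1 tag on GitHub rather than from PyPI, presumably to match the remote code that trust_remote_code=True loads for Aero-1-Audio-1.5B.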