Spaces: Running on Zero
Upload app.py
app.py CHANGED
@@ -1,27 +1,21 @@
 import gradio as gr
-import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
-import os
-from threading import Thread
-import uuid
 import soundfile as sf
 import numpy as np
-from transformers.generation import TextIteratorStreamer

 # Model and Tokenizer Loading
 MODEL_ID = "Qwen/Qwen-Audio-Chat"
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.float16,
-    device_map="auto",
-    trust_remote_code=True
-)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-
-DESCRIPTION = "[Qwen-Audio-Chat Demo](https://huggingface.co/Qwen/Qwen-Audio-Chat)"

-
+def load_model():
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        trust_remote_code=True
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    return model, tokenizer

 def process_audio(audio_path):
     """Process audio file and return the appropriate format for the model."""
@@ -30,27 +24,29 @@ def process_audio(audio_path):
         audio_data = audio_data.mean(axis=1)  # Convert stereo to mono if necessary
     return audio_data, sample_rate

-
-
-
-
-
-
-
+def analyze_audio(audio_path: str, question: str = None) -> str:
+    """
+    Main function for audio analysis that will be exposed as a tool.
+    Args:
+        audio_path: Path to the audio file
+        question: Optional question about the audio
+    Returns:
+        str: Model's response about the audio
+    """
+    model, tokenizer = load_model()
+
+    if not audio_path:
+        return "Please provide an audio file."
+
+    query = question if question else "Please describe what you hear in this audio clip."

-    # Prepare the messages
-    if text_input:
-        query = text_input
-    else:
-        query = "Please describe what you hear in this audio clip."
-
     messages = [
         {
             "role": "user",
             "content": [
                 {
                     "type": "audio",
-                    "audio":
+                    "audio": audio_path,
                 },
                 {
                     "type": "text",
@@ -60,7 +56,6 @@ def qwen_inference(audio_input, text_input=None):
         }
     ]

-    # Convert messages to model input format
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
@@ -68,56 +63,33 @@ def qwen_inference(audio_input, text_input=None):
     )
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

-
-
-
-
-
-
-        temperature=0.7,
-        do_sample=True
-    )
-
-    # Start generation in a separate thread
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    # Stream the output
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        yield buffer
-
-css = """
-#output {
-    height: 500px;
-    overflow: auto;
-    border: 1px solid #ccc;
-}
-"""
-
-with gr.Blocks(css=css) as demo:
-    gr.Markdown(DESCRIPTION)
-
-    with gr.Tab(label="Audio Input"):
-        with gr.Row():
-            with gr.Column():
-                input_audio = gr.Audio(
-                    label="Upload Audio",
-                    type="filepath"
-                )
-                text_input = gr.Textbox(
-                    label="Question (optional)",
-                    placeholder="Ask a question about the audio or leave empty for general description"
-                )
-                submit_btn = gr.Button(value="Submit")
-            with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-
-    submit_btn.click(
-        qwen_inference,
-        [input_audio, text_input],
-        [output_text]
+    with torch.no_grad():
+        outputs = model.generate(
+            **model_inputs,
+            max_new_tokens=512,
+            temperature=0.7,
+            do_sample=True
         )
+
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return response
+
+# Create Gradio interface with clear input/output specifications
+demo = gr.Interface(
+    fn=analyze_audio,
+    inputs=[
+        gr.Audio(type="filepath", label="Audio Input"),
+        gr.Textbox(label="Question", placeholder="Optional: Ask a specific question about the audio")
+    ],
+    outputs=gr.Textbox(label="Analysis"),
+    title="Qwen Audio Analysis Tool",
+    description="Upload an audio file to get AI-powered analysis using Qwen-Audio-Chat model",
+    examples=[
+        ["path/to/example1.wav", "What instruments do you hear?"],
+        ["path/to/example2.wav", "Describe the mood of this audio."]
+    ],
+    cache_examples=False
+)

-
+if __name__ == "__main__":
+    demo.launch()
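The updated app.py exposes analyze_audio through a single gr.Interface, so a running copy of the Space can also be queried programmatically. Below is a minimal client-side sketch, assuming a placeholder Space ID (your-username/qwen-audio-chat-demo) and a local sample.wav; the handle_file wrapper and the default /predict endpoint name follow recent gradio_client releases, and older client versions may instead accept a plain file path.

# Minimal sketch of calling the Space's analyze_audio endpoint from Python.
# The Space ID and audio file below are placeholders, not part of this commit.
from gradio_client import Client, handle_file

client = Client("your-username/qwen-audio-chat-demo")  # hypothetical Space ID

result = client.predict(
    handle_file("sample.wav"),          # maps to gr.Audio(type="filepath")
    "What instruments do you hear?",    # maps to the optional question textbox
    api_name="/predict",                # default endpoint for a single gr.Interface
)
print(result)  # text returned by analyze_audio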