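"""Gradio Space for audio analysis with Qwen-Audio-Chat.

Users upload or record a clip; it is normalized to a mono WAV file and
passed to the model together with an optional question about the audio.
"""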
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
import soundfile as sf
import numpy as np
import os
import tempfile
import spaces

# Model and Tokenizer Loading
MODEL_ID = "Qwen/Qwen-Audio-Chat"
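# Qwen-Audio-Chat is a multimodal chat model: it takes an audio clip together
# with a text prompt and answers questions about what it hears.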

def load_model():
    # Qwen-Audio-Chat ships its own modeling and tokenization code on the
    # Hub, so both loads need trust_remote_code=True. That remote code also
    # supplies the chat() helper and prompt format used below, so no manual
    # chat template needs to be set on the tokenizer.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    return model, tokenizer

def process_audio(audio_path):
    """Normalize the input to a mono float32 WAV on disk.

    Returns the path to the normalized file, or None if the input cannot
    be read as audio.
    """
    try:
        audio_data, sample_rate = sf.read(audio_path)

        # Downmix multi-channel recordings to mono.
        if len(audio_data.shape) > 1:
            audio_data = audio_data.mean(axis=1)

        audio_data = audio_data.astype(np.float32)

        # Write the normalized clip to a temporary WAV file so the model's
        # tokenizer can load it from a plain file path.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        sf.write(tmp.name, audio_data, sample_rate)
        tmp.close()
        return tmp.name
    except Exception:
        return None

# @spaces.GPU asks Hugging Face ZeroGPU for a GPU only while this call runs.
@spaces.GPU
def analyze_audio(audio_path: str, question: str = "") -> str:
    if audio_path is None or not isinstance(audio_path, str):
        return "Please provide a valid audio file."

    if not os.path.exists(audio_path):
        return f"Audio file not found: {audio_path}"

    wav_path = process_audio(audio_path)
    if wav_path is None:
        return "Failed to process the audio file. Please ensure it's a valid audio format."

    try:
        model, tokenizer = load_model()
        query_text = question if question else "Please describe what you hear in this audio clip."

        # Combine the audio file and the text prompt via the tokenizer's
        # list format, as documented on the Qwen-Audio-Chat model card; the
        # remote code expands it into the model's chat prompt with audio
        # tags, so the clip actually reaches the model.
        query = tokenizer.from_list_format([
            {"audio": wav_path},
            {"text": query_text},
        ])

        with torch.no_grad():
            response, _ = model.chat(tokenizer, query=query, history=None)

        if not response:
            return "The model failed to generate a response. Please try again."
        return response

    except Exception:
        return "An error occurred while processing. Please check your inputs and try again."

demo = gr.Interface(
    fn=analyze_audio,
    inputs=[
        gr.Audio(
            type="filepath",
            label="Audio Input",
            sources=["upload", "microphone"],
            format="mp3"
        ),
        gr.Textbox(
            label="Question",
            placeholder="Optional: Ask a specific question about the audio",
            value=""
        )
    ],
    outputs=gr.Textbox(label="Analysis"),
    title="Qwen Audio Analysis Tool",
    description="Upload an audio file or record from the microphone to get AI-powered analysis from the Qwen-Audio-Chat model",
    examples=[
        ["example1.wav", "What instruments do you hear?"]
    ],
    cache_examples=False
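    # Examples are left uncached so the GPU-decorated function is not run at
    # build time; example1.wav must be present in the Space repo to display.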
)

if __name__ == "__main__":
    demo.launch()