Spaces: Running on Zero

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
import soundfile as sf
import numpy as np
import os
import spaces

# Model and Tokenizer Loading
MODEL_ID = "Qwen/Qwen-Audio-Chat"
_model = None
_tokenizer = None

def load_model():
    # Load once and cache in module globals so repeated requests do not
    # re-download and re-instantiate the checkpoint.
    global _model, _tokenizer
    if _model is None:
        _model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        # No custom chat template is needed: the model's remote code defines
        # its own chat format.
        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    return _model, _tokenizer

def process_audio(audio_path):
    # Sanity-check that soundfile can decode the clip; down-mix multichannel
    # audio to mono float32. Qwen-Audio's preprocessing handles resampling,
    # so the raw sample rate is returned unchanged.
    try:
        audio_data, sample_rate = sf.read(audio_path)
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        return audio_data.astype(np.float32), sample_rate
    except Exception:
        return None
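
# ZeroGPU attaches a GPU only for the duration of a @spaces.GPU-decorated call.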
@spaces.GPU
def analyze_audio(audio_path: str, question: str | None = None) -> str:
    if not audio_path or not isinstance(audio_path, str):
        return "Please provide a valid audio file."
    if not os.path.exists(audio_path):
        return f"Audio file not found: {audio_path}"
    if process_audio(audio_path) is None:
        return "Failed to process the audio file. Please ensure it's a valid audio format."
    try:
        model, tokenizer = load_model()
        query_text = question if question else "Please describe what you hear in this audio clip."
        # from_list_format (provided by the model's remote code) interleaves
        # the audio file and the text prompt into a single chat query, so the
        # model actually receives the audio.
        query = tokenizer.from_list_format([
            {"audio": audio_path},
            {"text": query_text},
        ])
        with torch.no_grad():
            response, _history = model.chat(tokenizer, query=query, history=None)
        if not response:
            return "The model failed to generate a response. Please try again."
        return response
    except Exception:
        return "An error occurred while processing. Please check your inputs and try again."
demo = gr.Interface(
    fn=analyze_audio,
    inputs=[
        gr.Audio(
            type="filepath",
            label="Audio Input",
            sources=["upload", "microphone"],
            format="wav",  # keep recordings as WAV so soundfile can decode them
        ),
        gr.Textbox(
            label="Question",
            placeholder="Optional: Ask a specific question about the audio",
            value="",
        ),
    ],
    outputs=gr.Textbox(label="Analysis"),
    title="Qwen Audio Analysis Tool",
    description="Upload an audio file or record from the microphone to get AI-powered analysis from the Qwen-Audio-Chat model.",
    examples=[
        ["example1.wav", "What instruments do you hear?"]
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()
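    # On Spaces, launch() serves the app automatically. For a quick local
    # smoke test (assuming a sample clip is present), one could instead run:
    #   print(analyze_audio("example1.wav", "What instruments do you hear?"))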