desiree commited on
Commit
df835ed
·
verified ·
1 Parent(s): da7986b

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +25 -13
  2. app.py +98 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,13 +1,25 @@
1
- ---
2
- title: Qwen2 Audio 7B
3
- emoji: 🦀
4
- colorFrom: blue
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.7.1
8
- app_file: app.py
9
- pinned: false
10
- short_description: for audio understanding
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Qwen2 Audio Demo
2
+
3
+ This is a Hugging Face Space demo for the Qwen2-Audio-7B model. The app allows users to upload audio files and get AI-generated descriptions or answers to specific questions about the audio content.
4
+
5
+ ## Features
6
+
7
+ - Upload audio files (supports WAV, MP3, OGG, and FLAC formats)
8
+ - Ask specific questions about the audio content
9
+ - Get AI-generated descriptions of the audio
10
+ - Real-time streaming responses
11
+
12
+ ## Usage
13
+
14
+ 1. Upload an audio file using the audio input interface
15
+ 2. (Optional) Enter a specific question about the audio content
16
+ 3. Click "Submit" to get the AI's response
17
+ 4. The model will process the audio and generate a response in real-time
18
+
19
+ ## Model
20
+
21
+ This demo uses the NexaAIDev/Qwen2-Audio-7B-GGUF model, which is optimized for audio understanding and processing.
22
+
23
+ ## Requirements
24
+
25
+ See `requirements.txt` for a full list of dependencies.
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ import torch
5
+ import os
6
+ from threading import Thread
7
+ import uuid
8
+ import soundfile as sf
9
+ import numpy as np
10
+
11
+ # Model and Tokenizer Loading
12
+ MODEL_ID = "NexaAIDev/Qwen2-Audio-7B-GGUF"
13
+ model = AutoModelForCausalLM.from_pretrained(
14
+ MODEL_ID,
15
+ trust_remote_code=True,
16
+ torch_dtype=torch.float16
17
+ ).to("cuda").eval()
18
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
19
+
20
+ DESCRIPTION = "[Qwen2-Audio-7B Demo](https://huggingface.co/NexaAIDev/Qwen2-Audio-7B-GGUF)"
21
+
22
+ audio_extensions = (".wav", ".mp3", ".ogg", ".flac")
23
+
24
+ def process_audio(audio_path):
25
+ """Process audio file and return the appropriate format for the model."""
26
+ audio_data, sample_rate = sf.read(audio_path)
27
+ if len(audio_data.shape) > 1:
28
+ audio_data = audio_data.mean(axis=1) # Convert stereo to mono if necessary
29
+ return audio_data, sample_rate
30
+
31
+ @spaces.GPU
32
+ def qwen_inference(audio_input, text_input=None):
33
+ if not isinstance(audio_input, str) or not audio_input.lower().endswith(audio_extensions):
34
+ raise ValueError("Please upload a valid audio file (WAV, MP3, OGG, or FLAC)")
35
+
36
+ # Process audio input
37
+ audio_data, sample_rate = process_audio(audio_input)
38
+
39
+ # Prepare the prompt
40
+ if text_input:
41
+ prompt = f"Below is an audio clip. {text_input}"
42
+ else:
43
+ prompt = "Please describe what you hear in this audio clip."
44
+
45
+ # Tokenize input
46
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
47
+
48
+ # Generate response
49
+ streamer = tokenizer.get_streamer()
50
+ generation_kwargs = dict(
51
+ inputs=inputs,
52
+ streamer=streamer,
53
+ max_new_tokens=512,
54
+ temperature=0.7,
55
+ do_sample=True
56
+ )
57
+
58
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
59
+ thread.start()
60
+
61
+ buffer = ""
62
+ for new_text in streamer:
63
+ buffer += new_text
64
+ yield buffer
65
+
66
+ css = """
67
+ #output {
68
+ height: 500px;
69
+ overflow: auto;
70
+ border: 1px solid #ccc;
71
+ }
72
+ """
73
+
74
+ with gr.Blocks(css=css) as demo:
75
+ gr.Markdown(DESCRIPTION)
76
+
77
+ with gr.Tab(label="Audio Input"):
78
+ with gr.Row():
79
+ with gr.Column():
80
+ input_audio = gr.Audio(
81
+ label="Upload Audio",
82
+ type="filepath"
83
+ )
84
+ text_input = gr.Textbox(
85
+ label="Question (optional)",
86
+ placeholder="Ask a question about the audio or leave empty for general description"
87
+ )
88
+ submit_btn = gr.Button(value="Submit")
89
+ with gr.Column():
90
+ output_text = gr.Textbox(label="Output Text")
91
+
92
+ submit_btn.click(
93
+ qwen_inference,
94
+ [input_audio, text_input],
95
+ [output_text]
96
+ )
97
+
98
+ demo.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ torch>=2.0.0
3
+ transformers>=4.36.0
4
+ soundfile>=0.12.1
5
+ numpy>=1.24.0
6
+ huggingface-hub>=0.19.0