Spaces:

desiree
/

Qwen2-Audio-7B

Running on Zero

App Files Files Community

desiree committed on Dec 2, 2024

Commit

df835ed

verified ·

1 Parent(s): da7986b

Upload 3 files

Browse files

Files changed (3) hide show

README.md +25 -13
app.py +98 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,13 +1,25 @@
----
-title: Qwen2 Audio 7B
-emoji: 🦀
-colorFrom: blue
-colorTo: yellow
-sdk: gradio
-sdk_version: 5.7.1
-app_file: app.py
-pinned: false
-short_description: for audio understanding
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Qwen2 Audio Demo
+This is a Hugging Face Space demo for the Qwen2-Audio-7B model. The app allows users to upload audio files and get AI-generated descriptions or answers to specific questions about the audio content.
+## Features
+- Upload audio files (supports WAV, MP3, OGG, and FLAC formats)
+- Ask specific questions about the audio content
+- Get AI-generated descriptions of the audio
+- Real-time streaming responses
+## Usage
+1. Upload an audio file using the audio input interface
+2. (Optional) Enter a specific question about the audio content
+3. Click "Submit" to get the AI's response
+4. The model will process the audio and generate a response in real-time
+## Model
+This demo uses the NexaAIDev/Qwen2-Audio-7B-GGUF model, which is optimized for audio understanding and processing.
+## Requirements
+See `requirements.txt` for a full list of dependencies.

app.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import gradio as gr
+import spaces
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+import os
+from threading import Thread
+import uuid
+import soundfile as sf
+import numpy as np
+# Model and Tokenizer Loading
+MODEL_ID = "NexaAIDev/Qwen2-Audio-7B-GGUF"
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to("cuda").eval()
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+DESCRIPTION = "[Qwen2-Audio-7B Demo](https://huggingface.co/NexaAIDev/Qwen2-Audio-7B-GGUF)"
+audio_extensions = (".wav", ".mp3", ".ogg", ".flac")
+def process_audio(audio_path):
+    """Process audio file and return the appropriate format for the model."""
+    audio_data, sample_rate = sf.read(audio_path)
+    if len(audio_data.shape) > 1:
+        audio_data = audio_data.mean(axis=1)  # Convert stereo to mono if necessary
+    return audio_data, sample_rate
+@spaces.GPU
+def qwen_inference(audio_input, text_input=None):
+    if not isinstance(audio_input, str) or not audio_input.lower().endswith(audio_extensions):
+        raise ValueError("Please upload a valid audio file (WAV, MP3, OGG, or FLAC)")
+    # Process audio input
+    audio_data, sample_rate = process_audio(audio_input)
+    # Prepare the prompt
+    if text_input:
+        prompt = f"Below is an audio clip. {text_input}"
+    else:
+        prompt = "Please describe what you hear in this audio clip."
+    # Tokenize input
+    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+    # Generate response
+    streamer = tokenizer.get_streamer()
+    generation_kwargs = dict(
+        inputs=inputs,
+        streamer=streamer,
+        max_new_tokens=512,
+        temperature=0.7,
+        do_sample=True
+    )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        yield buffer
+css = """
+  #output {
+    height: 500px;
+    overflow: auto;
+    border: 1px solid #ccc;
+  }
+"""
+with gr.Blocks(css=css) as demo:
+    gr.Markdown(DESCRIPTION)
+    with gr.Tab(label="Audio Input"):
+        with gr.Row():
+            with gr.Column():
+                input_audio = gr.Audio(
+                    label="Upload Audio",
+                    type="filepath"
+                )
+                text_input = gr.Textbox(
+                    label="Question (optional)",
+                    placeholder="Ask a question about the audio or leave empty for general description"
+                )
+                submit_btn = gr.Button(value="Submit")
+            with gr.Column():
+                output_text = gr.Textbox(label="Output Text")
+        submit_btn.click(
+            qwen_inference,
+            [input_audio, text_input],
+            [output_text]
+        )
+demo.launch(debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio>=4.0.0
+torch>=2.0.0
+transformers>=4.36.0
+soundfile>=0.12.1
+numpy>=1.24.0
+huggingface-hub>=0.19.0