Spaces: Running on Zero
Upload app.py
app.py CHANGED
@@ -1,27 +1,21 @@
 import gradio as gr
-import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
-import os
-from threading import Thread
-import uuid
 import soundfile as sf
 import numpy as np
-from transformers.generation import TextIteratorStreamer

 # Model and Tokenizer Loading
 MODEL_ID = "Qwen/Qwen-Audio-Chat"
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.float16,
-    device_map="auto",
-    trust_remote_code=True
-)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-
-DESCRIPTION = "[Qwen-Audio-Chat Demo](https://huggingface.co/Qwen/Qwen-Audio-Chat)"

-
+def load_model():
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        trust_remote_code=True
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    return model, tokenizer

 def process_audio(audio_path):
     """Process audio file and return the appropriate format for the model."""
@@ -30,27 +24,29 @@ def process_audio(audio_path):
         audio_data = audio_data.mean(axis=1)  # Convert stereo to mono if necessary
     return audio_data, sample_rate

-
-
-
-
-
-
-
+def analyze_audio(audio_path: str, question: str = None) -> str:
+    """
+    Main function for audio analysis that will be exposed as a tool.
+    Args:
+        audio_path: Path to the audio file
+        question: Optional question about the audio
+    Returns:
+        str: Model's response about the audio
+    """
+    model, tokenizer = load_model()
+
+    if not audio_path:
+        return "Please provide an audio file."
+
+    query = question if question else "Please describe what you hear in this audio clip."

-    # Prepare the messages
-    if text_input:
-        query = text_input
-    else:
-        query = "Please describe what you hear in this audio clip."
-
     messages = [
         {
             "role": "user",
             "content": [
                 {
                     "type": "audio",
-                    "audio":
+                    "audio": audio_path,
                 },
                 {
                     "type": "text",
@@ -60,7 +56,6 @@ def qwen_inference(audio_input, text_input=None):
         }
     ]

-    # Convert messages to model input format
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
@@ -68,56 +63,33 @@ def qwen_inference(audio_input, text_input=None):
     )
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

-
-
-
-
-
-
-        temperature=0.7,
-        do_sample=True
-    )
-
-    # Start generation in a separate thread
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    # Stream the output
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        yield buffer
-
-css = """
-#output {
-    height: 500px;
-    overflow: auto;
-    border: 1px solid #ccc;
-}
-"""
-
-with gr.Blocks(css=css) as demo:
-    gr.Markdown(DESCRIPTION)
-
-    with gr.Tab(label="Audio Input"):
-        with gr.Row():
-            with gr.Column():
-                input_audio = gr.Audio(
-                    label="Upload Audio",
-                    type="filepath"
-                )
-                text_input = gr.Textbox(
-                    label="Question (optional)",
-                    placeholder="Ask a question about the audio or leave empty for general description"
-                )
-                submit_btn = gr.Button(value="Submit")
-            with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-
-    submit_btn.click(
-        qwen_inference,
-        [input_audio, text_input],
-        [output_text]
+    with torch.no_grad():
+        outputs = model.generate(
+            **model_inputs,
+            max_new_tokens=512,
+            temperature=0.7,
+            do_sample=True
         )
+
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return response
+
+# Create Gradio interface with clear input/output specifications
+demo = gr.Interface(
+    fn=analyze_audio,
+    inputs=[
+        gr.Audio(type="filepath", label="Audio Input"),
+        gr.Textbox(label="Question", placeholder="Optional: Ask a specific question about the audio")
+    ],
+    outputs=gr.Textbox(label="Analysis"),
+    title="Qwen Audio Analysis Tool",
+    description="Upload an audio file to get AI-powered analysis using Qwen-Audio-Chat model",
+    examples=[
+        ["path/to/example1.wav", "What instruments do you hear?"],
+        ["path/to/example2.wav", "Describe the mood of this audio."]
+    ],
+    cache_examples=False
+)

-
+if __name__ == "__main__":
+    demo.launch()
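The updated app.py exposes analyze_audio through a single gr.Interface, so a running copy of the Space can also be queried programmatically. Below is a minimal client-side sketch, assuming a placeholder Space ID (your-username/qwen-audio-chat-demo) and a local sample.wav; the handle_file wrapper and the default /predict endpoint name follow recent gradio_client releases, and older client versions may instead accept a plain file path.

# Minimal sketch of calling the Space's analyze_audio endpoint from Python.
# The Space ID and audio file below are placeholders, not part of this commit.
from gradio_client import Client, handle_file

client = Client("your-username/qwen-audio-chat-demo")  # hypothetical Space ID

result = client.predict(
    handle_file("sample.wav"),          # maps to gr.Audio(type="filepath")
    "What instruments do you hear?",    # maps to the optional question textbox
    api_name="/predict",                # default endpoint for a single gr.Interface
)
print(result)  # text returned by analyze_audio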