desiree committed (verified)
Commit fff885e · 1 Parent(s): 56709e2

Upload app.py

Files changed (1): app.py +47 -19
app.py CHANGED
@@ -5,12 +5,29 @@ import torch
 import soundfile as sf
 import numpy as np
 import os
+import sys
+from pathlib import Path
 
 # Model and Tokenizer Loading
 MODEL_ID = "Qwen/Qwen-Audio-Chat"
 
-# Qwen chat template
-QWEN_CHAT_TEMPLATE = """{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] }}{% endif %}{{ eos_token }}{% endfor %}"""
+# Add the model's directory to sys.path to import its audio module
+def setup_audio_module():
+    try:
+        from huggingface_hub import snapshot_download
+
+        # Download the model files
+        model_path = snapshot_download(MODEL_ID)
+        if model_path not in sys.path:
+            sys.path.append(model_path)
+
+        # Now we can import the audio module
+        global Audio
+        from audio import Audio
+        return True
+    except Exception as e:
+        print(f"Error setting up audio module: {e}")
+        return False
 
 def load_model():
     print("Loading model and tokenizer...")
@@ -21,22 +38,22 @@ def load_model():
         trust_remote_code=True
     )
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-    tokenizer.chat_template = QWEN_CHAT_TEMPLATE
     print("Model and tokenizer loaded successfully")
     return model, tokenizer
 
 def process_audio(audio_path):
-    """Process audio file and return the appropriate format for the model."""
+    """Process audio file using Qwen's audio module."""
     try:
         print(f"Processing audio file: {audio_path}")
-        audio_data, sample_rate = sf.read(audio_path)
-        if len(audio_data.shape) > 1:
-            audio_data = audio_data.mean(axis=1)  # Convert stereo to mono if necessary
-        print(f"Audio processed successfully. Sample rate: {sample_rate}, Shape: {audio_data.shape}")
-        return True
+        # Initialize Audio processor
+        audio_processor = Audio()
+        # Process the audio file
+        processed_audio = audio_processor.load_audio_from_file(audio_path)
+        print("Audio processed successfully")
+        return processed_audio
     except Exception as e:
         print(f"Error processing audio: {e}")
-        return False
+        return None
 
 @spaces.GPU
 def analyze_audio(audio_path: str, question: str = None) -> str:
@@ -57,7 +74,13 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
     if not os.path.exists(audio_path):
         return f"Audio file not found: {audio_path}"
 
-    if not process_audio(audio_path):
+    # Setup audio module
+    if not setup_audio_module():
+        return "Failed to initialize audio processing module."
+
+    # Process audio
+    processed_audio = process_audio(audio_path)
+    if processed_audio is None:
         return "Failed to process the audio file. Please ensure it's a valid audio format."
 
     try:
@@ -68,7 +91,16 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
         messages = [
             {
                 "role": "user",
-                "content": f"Here is an audio clip: <audio>{audio_path}</audio>\n{query}"
+                "content": [
+                    {
+                        "type": "audio",
+                        "audio_data": processed_audio,
+                    },
+                    {
+                        "type": "text",
+                        "text": query,
+                    },
+                ],
             }
         ]
 
@@ -78,7 +110,7 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
             tokenize=False,
            add_generation_prompt=True
        )
-        print(f"Generated prompt text: {text[:200]}...")  # Print first 200 chars of prompt
+        print(f"Generated prompt text: {text[:200]}...")
 
         print("Tokenizing input...")
         model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
@@ -98,14 +130,10 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
         if outputs is None:
             print("Model generated None output")
             return "The model failed to generate a response. Please try again."
-
-        print(f"Output shape: {outputs.shape}")
-        if len(outputs.shape) != 2 or outputs.shape[0] == 0:
-            print(f"Unexpected output shape: {outputs.shape}")
-            return "The model generated an invalid response. Please try again."
 
+        print(f"Output shape: {outputs.shape}")
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        print(f"Generated response: {response[:200]}...")  # Print first 200 chars of response
+        print(f"Generated response: {response[:200]}...")
         return response
 
     except Exception as e:
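
For reference, a minimal smoke test of the flow this commit sets up might look like the sketch below. It is not part of the commit: "sample.wav" is a hypothetical test clip, the snippet assumes it sits at the bottom of app.py after the module-level model/tokenizer setup has run, and it assumes the @spaces.GPU decorator passes calls through when run outside a ZeroGPU Space.

# Minimal smoke test (a sketch, not in the commit). Assumes app.py has
# already created the global model and tokenizer via load_model(), and
# that "sample.wav" (hypothetical) exists next to the script.
if __name__ == "__main__":
    clip = "sample.wav"  # hypothetical test clip
    if not os.path.exists(clip):
        raise SystemExit(f"Put a short test clip at {clip} first.")
    # analyze_audio() calls setup_audio_module() and process_audio()
    # internally before building the chat prompt and generating.
    print(analyze_audio(clip, "What sounds are present in this clip?"))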