Spaces: Running on Zero
Upload app.py
app.py CHANGED
@@ -5,12 +5,29 @@ import torch
 import soundfile as sf
 import numpy as np
 import os
+import sys
+from pathlib import Path

 # Model and Tokenizer Loading
 MODEL_ID = "Qwen/Qwen-Audio-Chat"

-#
-
+# Add the model's directory to sys.path to import its audio module
+def setup_audio_module():
+    try:
+        from huggingface_hub import snapshot_download
+
+        # Download the model files
+        model_path = snapshot_download(MODEL_ID)
+        if model_path not in sys.path:
+            sys.path.append(model_path)
+
+        # Now we can import the audio module
+        global Audio
+        from audio import Audio
+        return True
+    except Exception as e:
+        print(f"Error setting up audio module: {e}")
+        return False

 def load_model():
     print("Loading model and tokenizer...")
@@ -21,22 +38,22 @@ def load_model():
         trust_remote_code=True
     )
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-    tokenizer.chat_template = QWEN_CHAT_TEMPLATE
     print("Model and tokenizer loaded successfully")
     return model, tokenizer

 def process_audio(audio_path):
-    """Process audio file"""
+    """Process audio file using Qwen's audio module."""
     try:
         print(f"Processing audio file: {audio_path}")
-
-
-
-
-
+        # Initialize Audio processor
+        audio_processor = Audio()
+        # Process the audio file
+        processed_audio = audio_processor.load_audio_from_file(audio_path)
+        print("Audio processed successfully")
+        return processed_audio
     except Exception as e:
         print(f"Error processing audio: {e}")
-        return
+        return None

 @spaces.GPU
 def analyze_audio(audio_path: str, question: str = None) -> str:
@@ -57,7 +74,13 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
     if not os.path.exists(audio_path):
         return f"Audio file not found: {audio_path}"

-
+    # Setup audio module
+    if not setup_audio_module():
+        return "Failed to initialize audio processing module."
+
+    # Process audio
+    processed_audio = process_audio(audio_path)
+    if processed_audio is None:
         return "Failed to process the audio file. Please ensure it's a valid audio format."

     try:
@@ -68,7 +91,16 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
         messages = [
             {
                 "role": "user",
-                "content":
+                "content": [
+                    {
+                        "type": "audio",
+                        "audio_data": processed_audio,
+                    },
+                    {
+                        "type": "text",
+                        "text": query,
+                    },
+                ],
             }
         ]

@@ -78,7 +110,7 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
             tokenize=False,
             add_generation_prompt=True
         )
-        print(f"Generated prompt text: {text[:200]}...")
+        print(f"Generated prompt text: {text[:200]}...")

         print("Tokenizing input...")
         model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
@@ -98,14 +130,10 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
         if outputs is None:
             print("Model generated None output")
             return "The model failed to generate a response. Please try again."
-
-        print(f"Output shape: {outputs.shape}")
-        if len(outputs.shape) != 2 or outputs.shape[0] == 0:
-            print(f"Unexpected output shape: {outputs.shape}")
-            return "The model generated an invalid response. Please try again."

+        print(f"Output shape: {outputs.shape}")
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        print(f"Generated response: {response[:200]}...")
+        print(f"Generated response: {response[:200]}...")
         return response

     except Exception as e:
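
Note on the prompt construction: the commit drops the manual tokenizer.chat_template = QWEN_CHAT_TEMPLATE assignment and now passes a multimodal content list to apply_chat_template, so rendering depends on whatever chat template the checkpoint itself ships. Whether the audio/text content-list form above serializes correctly is template-specific. Below is a minimal sketch of the call shape with a plain text-only message, which any chat template can render; it is a sketch for orientation, not the committed code.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen-Audio-Chat", trust_remote_code=True)
messages = [{"role": "user", "content": "Describe this audio clip."}]
# tokenize=False returns the rendered prompt string instead of token ids.
text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)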
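Note on audio loading: app.py already imports soundfile and numpy at the top of the file, so if importing the model repo's audio module ever fails, a plain loader could stand in. This is an alternative sketch under that assumption, not what process_audio does in the commit; it returns a mono float32 waveform and its sample rate rather than whatever Audio.load_audio_from_file produces.

import numpy as np
import soundfile as sf

def load_waveform(audio_path):
    # sf.read returns (samples, sample_rate); request float32 samples.
    data, sample_rate = sf.read(audio_path, dtype="float32")
    # Downmix multi-channel audio to mono by averaging channels.
    if data.ndim > 1:
        data = data.mean(axis=1)
    return data, sample_rate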
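Note on end-to-end use: a minimal smoke test of how the updated pieces fit together. It assumes model and tokenizer are created once at import time via load_model(), as is typical for a ZeroGPU Space, and "sample.wav" plus the question string are placeholders.

# Hypothetical local smoke test for the updated flow in app.py.
model, tokenizer = load_model()
result = analyze_audio(
    "sample.wav",                          # placeholder audio file
    "What is happening in this audio?",    # free-form question
)
print(result)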