desiree committed (verified)
Commit fff885e · 1 Parent(s): 56709e2

Upload app.py

Files changed (1): app.py +47 -19
app.py CHANGED
@@ -5,12 +5,29 @@ import torch
 import soundfile as sf
 import numpy as np
 import os
+import sys
+from pathlib import Path
 
 # Model and Tokenizer Loading
 MODEL_ID = "Qwen/Qwen-Audio-Chat"
 
-# Qwen chat template
-QWEN_CHAT_TEMPLATE = """{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] }}{% endif %}{{ eos_token }}{% endfor %}"""
+# Add the model's directory to sys.path to import its audio module
+def setup_audio_module():
+    try:
+        from huggingface_hub import snapshot_download
+
+        # Download the model files
+        model_path = snapshot_download(MODEL_ID)
+        if model_path not in sys.path:
+            sys.path.append(model_path)
+
+        # Now we can import the audio module
+        global Audio
+        from audio import Audio
+        return True
+    except Exception as e:
+        print(f"Error setting up audio module: {e}")
+        return False
 
 def load_model():
     print("Loading model and tokenizer...")
@@ -21,22 +38,22 @@ def load_model():
         trust_remote_code=True
     )
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-    tokenizer.chat_template = QWEN_CHAT_TEMPLATE
     print("Model and tokenizer loaded successfully")
     return model, tokenizer
 
 def process_audio(audio_path):
-    """Process audio file and return the appropriate format for the model."""
+    """Process audio file using Qwen's audio module."""
     try:
         print(f"Processing audio file: {audio_path}")
-        audio_data, sample_rate = sf.read(audio_path)
-        if len(audio_data.shape) > 1:
-            audio_data = audio_data.mean(axis=1)  # Convert stereo to mono if necessary
-        print(f"Audio processed successfully. Sample rate: {sample_rate}, Shape: {audio_data.shape}")
-        return True
+        # Initialize Audio processor
+        audio_processor = Audio()
+        # Process the audio file
+        processed_audio = audio_processor.load_audio_from_file(audio_path)
+        print("Audio processed successfully")
+        return processed_audio
     except Exception as e:
         print(f"Error processing audio: {e}")
-        return False
+        return None
 
 @spaces.GPU
 def analyze_audio(audio_path: str, question: str = None) -> str:
@@ -57,7 +74,13 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
     if not os.path.exists(audio_path):
         return f"Audio file not found: {audio_path}"
 
-    if not process_audio(audio_path):
+    # Setup audio module
+    if not setup_audio_module():
+        return "Failed to initialize audio processing module."
+
+    # Process audio
+    processed_audio = process_audio(audio_path)
+    if processed_audio is None:
         return "Failed to process the audio file. Please ensure it's a valid audio format."
 
     try:
@@ -68,7 +91,16 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
         messages = [
             {
                 "role": "user",
-                "content": f"Here is an audio clip: <audio>{audio_path}</audio>\n{query}"
+                "content": [
+                    {
+                        "type": "audio",
+                        "audio_data": processed_audio,
+                    },
+                    {
+                        "type": "text",
+                        "text": query,
+                    },
+                ],
             }
         ]
 
@@ -78,7 +110,7 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
             tokenize=False,
            add_generation_prompt=True
        )
-        print(f"Generated prompt text: {text[:200]}...")  # Print first 200 chars of prompt
+        print(f"Generated prompt text: {text[:200]}...")
 
         print("Tokenizing input...")
         model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
@@ -98,14 +130,10 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
         if outputs is None:
             print("Model generated None output")
             return "The model failed to generate a response. Please try again."
-
-        print(f"Output shape: {outputs.shape}")
-        if len(outputs.shape) != 2 or outputs.shape[0] == 0:
-            print(f"Unexpected output shape: {outputs.shape}")
-            return "The model generated an invalid response. Please try again."
 
+        print(f"Output shape: {outputs.shape}")
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        print(f"Generated response: {response[:200]}...")  # Print first 200 chars of response
+        print(f"Generated response: {response[:200]}...")
         return response
 
     except Exception as e:
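
For reference, a minimal smoke test of the flow this commit sets up might look like the sketch below. It is not part of the commit: "sample.wav" is a hypothetical test clip, the snippet assumes it sits at the bottom of app.py after the module-level model/tokenizer setup has run, and it assumes the @spaces.GPU decorator passes calls through when run outside a ZeroGPU Space.

# Minimal smoke test (a sketch, not in the commit). Assumes app.py has
# already created the global model and tokenizer via load_model(), and
# that "sample.wav" (hypothetical) exists next to the script.
if __name__ == "__main__":
    clip = "sample.wav"  # hypothetical test clip
    if not os.path.exists(clip):
        raise SystemExit(f"Put a short test clip at {clip} first.")
    # analyze_audio() calls setup_audio_module() and process_audio()
    # internally before building the chat prompt and generating.
    print(analyze_audio(clip, "What sounds are present in this clip?"))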