root committed
Commit e3108aa · 1 Parent(s): 801647a
Files changed (1)
  1. app.py +79 -39
app.py CHANGED
@@ -38,19 +38,25 @@ SAMPLE_RATE = 22050  # Standard sample rate for audio processing
 # Check CUDA availability (for informational purposes)
 CUDA_AVAILABLE = ensure_cuda_availability()
 
-# Load models
-@functools.lru_cache(maxsize=1)
-def load_genre_model():
-    print("Loading genre classification model...")
-    return pipeline(
-        "audio-classification",
-        model=GENRE_MODEL_NAME,
-        device=0 if CUDA_AVAILABLE else -1
+# Load models at initialization time
+print("Loading genre classification model...")
+try:
+    genre_feature_extractor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
+    genre_model = AutoModelForAudioClassification.from_pretrained(
+        GENRE_MODEL_NAME,
+        device_map="auto" if CUDA_AVAILABLE else None
     )
+    # Create a convenience wrapper function with the same interface as before
+    def get_genre_model():
+        return genre_model, genre_feature_extractor
+except Exception as e:
+    print(f"Error loading genre model: {str(e)}")
+    genre_model = None
+    genre_feature_extractor = None
 
-@functools.lru_cache(maxsize=1)
-def load_llm_pipeline():
-    print("Loading Qwen LLM model with 4-bit quantization...")
+# Load LLM and tokenizer at initialization time
+print("Loading Qwen LLM model with 4-bit quantization...")
+try:
     # Configure 4-bit quantization for better performance
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
@@ -59,17 +65,19 @@ def load_llm_pipeline():
         bnb_4bit_use_double_quant=True
     )
 
-    return pipeline(
-        "text-generation",
-        model=LLM_MODEL_NAME,
+    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
+    llm_model = AutoModelForCausalLM.from_pretrained(
+        LLM_MODEL_NAME,
+        quantization_config=quantization_config,
         device_map="auto",
         trust_remote_code=True,
-        model_kwargs={
-            "torch_dtype": torch.float16,
-            "quantization_config": quantization_config,
-            "use_cache": True
-        }
+        torch_dtype=torch.float16,
+        use_cache=True
     )
+except Exception as e:
+    print(f"Error loading LLM model: {str(e)}")
+    llm_tokenizer = None
+    llm_model = None
 
 # Create music analyzer instance
 music_analyzer = MusicAnalyzer()
@@ -95,17 +103,30 @@ def process_audio(audio_file):
     emotion = music_analysis["emotion_analysis"]["primary_emotion"]
     theme = music_analysis["theme_analysis"]["primary_theme"]
 
-    # Use genre classification pipeline
-    genre_classifier = load_genre_model()
-
-    # Resample audio to 16000 Hz for the genre model
-    y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)
-
-    # Classify genre
-    genre_results = genre_classifier({"raw": y_16k, "sampling_rate": 16000})
-
-    # Get top genres
-    top_genres = [(genre["label"], genre["score"]) for genre in genre_results]
+    # Use genre classification directly instead of pipeline
+    if genre_model is not None and genre_feature_extractor is not None:
+        # Resample audio to 16000 Hz for the genre model
+        y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)
+
+        # Extract features
+        inputs = genre_feature_extractor(
+            y_16k,
+            sampling_rate=16000,
+            return_tensors="pt"
+        ).to(genre_model.device)
+
+        # Classify genre
+        with torch.no_grad():
+            outputs = genre_model(**inputs)
+            logits = outputs.logits
+            probs = torch.nn.functional.softmax(logits, dim=-1)
+
+        # Get top genres
+        values, indices = torch.topk(probs[0], k=5)
+        top_genres = [(genre_model.config.id2label[idx.item()], val.item()) for val, idx in zip(values, indices)]
+    else:
+        # Fallback if model loading failed
+        top_genres = [("Unknown", 1.0)]
 
     # Format genre results for display
     genre_results_text = format_genre_results(top_genres)
@@ -145,8 +166,9 @@ def generate_lyrics(music_analysis, genre, duration):
     emotion = music_analysis["emotion_analysis"]["primary_emotion"]
     theme = music_analysis["theme_analysis"]["primary_theme"]
 
-    # Load LLM pipeline
-    text_generator = load_llm_pipeline()
+    # Verify LLM is loaded
+    if llm_model is None or llm_tokenizer is None:
+        return "Error: LLM model not properly loaded"
 
     # Construct prompt for the LLM
     prompt = f"""Write lyrics for a {genre} song with these specifications:
@@ -169,17 +191,36 @@ IMPORTANT INSTRUCTIONS:
 - Keep lyrics concise enough to fit the duration when sung at the given tempo
 """
 
-    # Generate lyrics using the LLM pipeline
-    generation_result = text_generator(
-        prompt,
+    # Generate lyrics using the LLM model directly
+    # Format as chat message
+    messages = [
+        {"role": "user", "content": prompt}
+    ]
+
+    # Apply chat template
+    text = llm_tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    # Tokenize and move to model device
+    model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)
+
+    # Generate with optimized parameters
+    generated_ids = llm_model.generate(
+        **model_inputs,
         max_new_tokens=1024,
         do_sample=True,
         temperature=0.7,
         top_p=0.9,
-        return_full_text=False
+        repetition_penalty=1.1,
+        pad_token_id=llm_tokenizer.eos_token_id
     )
 
-    lyrics = generation_result[0]["generated_text"]
+    # Decode the output
+    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+    lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
 
     # Enhanced post-processing to remove ALL structural elements and thinking
     # Remove any lines with section labels using a more comprehensive pattern
@@ -262,5 +303,4 @@ if __name__ == "__main__":
     demo.launch()
 else:
     # For Hugging Face Spaces
-    app = demo
-
+    app = demo
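The hunks above don't show the top of app.py, so the import changes are not visible. A minimal sketch of the imports the new direct-loading path appears to rely on, assuming nothing else in the file already provides them (the old pipeline and functools.lru_cache imports can presumably be dropped once the lazy loaders are gone):

# Assumed imports for the direct-loading path (not shown in this diff)
import torch
import librosa
from transformers import (
    AutoFeatureExtractor,             # genre model preprocessing
    AutoModelForAudioClassification,  # genre classifier
    AutoTokenizer,                    # Qwen tokenizer
    AutoModelForCausalLM,             # Qwen LLM
    BitsAndBytesConfig,               # 4-bit quantization config
)

The design trade-off in this commit: the functools.lru_cache lazy loaders are replaced by eager module-level loading wrapped in try/except, so the Space pays the full load cost at startup, and a failed load degrades to the ("Unknown", 1.0) genre fallback or the "Error: LLM model not properly loaded" message instead of raising on the first request.

For a quick smoke test of the new genre path outside Gradio, a minimal sketch; classify_file is a hypothetical helper, not part of app.py, and it assumes the module-level genre_model and genre_feature_extractor loaded successfully:

# Hypothetical helper mirroring the classification logic added to process_audio()
def classify_file(path, k=5):
    # Load at native rate, then resample to the 16 kHz the genre model expects
    y, sr = librosa.load(path, sr=None, mono=True)
    y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)
    inputs = genre_feature_extractor(
        y_16k, sampling_rate=16000, return_tensors="pt"
    ).to(genre_model.device)
    with torch.no_grad():
        probs = torch.nn.functional.softmax(genre_model(**inputs).logits, dim=-1)
    # Top-k (label, probability) pairs
    values, indices = torch.topk(probs[0], k=k)
    return [(genre_model.config.id2label[i.item()], v.item())
            for v, i in zip(values, indices)]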