admin committed on
Commit
6bbbe9a
·
1 Parent(s): 7b7e565
Files changed (1) hide show
  1. app.py +76 -89
app.py CHANGED
@@ -251,102 +251,87 @@ def circular_padding(spec: np.ndarray, end: int):
251
 
252
 
253
  def wav2mel(audio_path: str, width=2, top_db=40):
254
- os.makedirs(TEMP_DIR, exist_ok=True)
255
- try:
256
- y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
257
- non_silents = librosa.effects.split(y, top_db=top_db)
258
- y = np.concatenate([y[start:end] for start, end in non_silents])
259
- total_frames = len(y)
260
- if total_frames % (width * sr) != 0:
261
- count = total_frames // (width * sr) + 1
262
- y = circular_padding(y, count * width * sr)
263
-
264
- mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
265
- log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
266
- dur = librosa.get_duration(y=y, sr=sr)
267
- total_frames = log_mel_spec.shape[1]
268
- step = int(width * total_frames / dur)
269
- count = int(total_frames / step)
270
- begin = int(0.5 * (total_frames - count * step))
271
- end = begin + step * count
272
- for i in range(begin, end, step):
273
- librosa.display.specshow(log_mel_spec[:, i : i + step])
274
- plt.axis("off")
275
- plt.savefig(
276
- f"{TEMP_DIR}/{i}.jpg",
277
- bbox_inches="tight",
278
- pad_inches=0.0,
279
- )
280
- plt.close()
281
 
282
- except Exception as e:
283
- print(f"Error converting {audio_path} : {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
 
286
  def wav2cqt(audio_path: str, width=2, top_db=40):
287
- os.makedirs(TEMP_DIR, exist_ok=True)
288
- try:
289
- y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
290
- non_silents = librosa.effects.split(y, top_db=top_db)
291
- y = np.concatenate([y[start:end] for start, end in non_silents])
292
- total_frames = len(y)
293
- if total_frames % (width * sr) != 0:
294
- count = total_frames // (width * sr) + 1
295
- y = circular_padding(y, count * width * sr)
296
-
297
- cqt_spec = librosa.cqt(y=y, sr=sr)
298
- log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
299
- dur = librosa.get_duration(y=y, sr=sr)
300
- total_frames = log_cqt_spec.shape[1]
301
- step = int(width * total_frames / dur)
302
- count = int(total_frames / step)
303
- begin = int(0.5 * (total_frames - count * step))
304
- end = begin + step * count
305
- for i in range(begin, end, step):
306
- librosa.display.specshow(log_cqt_spec[:, i : i + step])
307
- plt.axis("off")
308
- plt.savefig(
309
- f"{TEMP_DIR}/{i}.jpg",
310
- bbox_inches="tight",
311
- pad_inches=0.0,
312
- )
313
- plt.close()
314
 
315
- except Exception as e:
316
- print(f"Error converting {audio_path} : {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
 
319
  def wav2chroma(audio_path: str, width=2, top_db=40):
320
- os.makedirs(TEMP_DIR, exist_ok=True)
321
- try:
322
- y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
323
- non_silents = librosa.effects.split(y, top_db=top_db)
324
- y = np.concatenate([y[start:end] for start, end in non_silents])
325
- total_frames = len(y)
326
- if total_frames % (width * sr) != 0:
327
- count = total_frames // (width * sr) + 1
328
- y = circular_padding(y, count * width * sr)
329
-
330
- chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
331
- log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
332
- dur = librosa.get_duration(y=y, sr=sr)
333
- total_frames = log_chroma_spec.shape[1]
334
- step = int(width * total_frames / dur)
335
- count = int(total_frames / step)
336
- begin = int(0.5 * (total_frames - count * step))
337
- end = begin + step * count
338
- for i in range(begin, end, step):
339
- librosa.display.specshow(log_chroma_spec[:, i : i + step])
340
- plt.axis("off")
341
- plt.savefig(
342
- f"{TEMP_DIR}/{i}.jpg",
343
- bbox_inches="tight",
344
- pad_inches=0.0,
345
- )
346
- plt.close()
347
 
348
- except Exception as e:
349
- print(f"Error converting {audio_path} : {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
 
351
 
352
  def most_frequent_value(lst: list):
@@ -366,13 +351,15 @@ def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
366
  if not wav_path:
367
  return None, "Please input an audio!"
368
 
 
 
369
  try:
370
  model = EvalNet(log_name, len(TRANSLATE)).model
 
 
371
  except Exception as e:
372
  return None, f"{e}"
373
 
374
- spec = log_name.split("_")[-3]
375
- eval("wav2%s" % spec)(wav_path)
376
  jpgs = find_files(folder_path, ".jpg")
377
  preds = []
378
  for jpg in jpgs:
 
251
 
252
 
253
  def wav2mel(audio_path: str, width=2, top_db=40):
254
+ y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
255
+ non_silents = librosa.effects.split(y, top_db=top_db)
256
+ y = np.concatenate([y[start:end] for start, end in non_silents])
257
+ total_frames = len(y)
258
+ if total_frames % (width * sr) != 0:
259
+ count = total_frames // (width * sr) + 1
260
+ y = circular_padding(y, count * width * sr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
+ mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
263
+ log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
264
+ dur = librosa.get_duration(y=y, sr=sr)
265
+ total_frames = log_mel_spec.shape[1]
266
+ step = int(width * total_frames / dur)
267
+ count = int(total_frames / step)
268
+ begin = int(0.5 * (total_frames - count * step))
269
+ end = begin + step * count
270
+ for i in range(begin, end, step):
271
+ librosa.display.specshow(log_mel_spec[:, i : i + step])
272
+ plt.axis("off")
273
+ plt.savefig(
274
+ f"{TEMP_DIR}/{i}.jpg",
275
+ bbox_inches="tight",
276
+ pad_inches=0.0,
277
+ )
278
+ plt.close()
279
 
280
 
281
  def wav2cqt(audio_path: str, width=2, top_db=40):
282
+ y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
283
+ non_silents = librosa.effects.split(y, top_db=top_db)
284
+ y = np.concatenate([y[start:end] for start, end in non_silents])
285
+ total_frames = len(y)
286
+ if total_frames % (width * sr) != 0:
287
+ count = total_frames // (width * sr) + 1
288
+ y = circular_padding(y, count * width * sr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
 
290
+ cqt_spec = librosa.cqt(y=y, sr=sr)
291
+ log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
292
+ dur = librosa.get_duration(y=y, sr=sr)
293
+ total_frames = log_cqt_spec.shape[1]
294
+ step = int(width * total_frames / dur)
295
+ count = int(total_frames / step)
296
+ begin = int(0.5 * (total_frames - count * step))
297
+ end = begin + step * count
298
+ for i in range(begin, end, step):
299
+ librosa.display.specshow(log_cqt_spec[:, i : i + step])
300
+ plt.axis("off")
301
+ plt.savefig(
302
+ f"{TEMP_DIR}/{i}.jpg",
303
+ bbox_inches="tight",
304
+ pad_inches=0.0,
305
+ )
306
+ plt.close()
307
 
308
 
309
  def wav2chroma(audio_path: str, width=2, top_db=40):
310
+ y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
311
+ non_silents = librosa.effects.split(y, top_db=top_db)
312
+ y = np.concatenate([y[start:end] for start, end in non_silents])
313
+ total_frames = len(y)
314
+ if total_frames % (width * sr) != 0:
315
+ count = total_frames // (width * sr) + 1
316
+ y = circular_padding(y, count * width * sr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
+ chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
319
+ log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
320
+ dur = librosa.get_duration(y=y, sr=sr)
321
+ total_frames = log_chroma_spec.shape[1]
322
+ step = int(width * total_frames / dur)
323
+ count = int(total_frames / step)
324
+ begin = int(0.5 * (total_frames - count * step))
325
+ end = begin + step * count
326
+ for i in range(begin, end, step):
327
+ librosa.display.specshow(log_chroma_spec[:, i : i + step])
328
+ plt.axis("off")
329
+ plt.savefig(
330
+ f"{TEMP_DIR}/{i}.jpg",
331
+ bbox_inches="tight",
332
+ pad_inches=0.0,
333
+ )
334
+ plt.close()
335
 
336
 
337
  def most_frequent_value(lst: list):
 
351
  if not wav_path:
352
  return None, "Please input an audio!"
353
 
354
+ spec = log_name.split("_")[-3]
355
+ os.makedirs(folder_path, exist_ok=True)
356
  try:
357
  model = EvalNet(log_name, len(TRANSLATE)).model
358
+ eval("wav2%s" % spec)(wav_path)
359
+
360
  except Exception as e:
361
  return None, f"{e}"
362
 
 
 
363
  jpgs = find_files(folder_path, ".jpg")
364
  preds = []
365
  for jpg in jpgs: