Spaces:
Running
Running
import os | |
import sys | |
import torch | |
import random | |
import shutil | |
import librosa | |
import warnings | |
import subprocess | |
import numpy as np | |
import gradio as gr | |
import librosa.display | |
import matplotlib.pyplot as plt | |
import torchvision.transforms as transforms | |
from utils import get_modelist, find_mp3_files, download | |
from collections import Counter | |
from model import EvalNet | |
from PIL import Image | |
# Maps each genre class name to the bilingual (Chinese + English) label that is
# returned to the user interface.
#
# The insertion order is significant: CLASSES is derived from it, and the
# integer class id predicted by the model is used as an index into CLASSES.
# Do not reorder or insert entries without retraining/re-mapping the models.
TRANSLATE = {
    "Symphony": "交响乐 Symphony",
    "Opera": "戏曲 Opera",
    "Solo": "独奏 Solo",
    "Chamber": "室内乐 Chamber",
    "Pop_vocal_ballad": "芭乐 Pop vocal ballad",
    "Adult_contemporary": "成人时代 Adult contemporary",
    "Teen_pop": "青少年流行 Teen pop",
    "Contemporary_dance_pop": "当代流行舞曲 Contemporary dance pop",
    "Dance_pop": "流行舞曲 Dance pop",
    "Classic_indie_pop": "经典独立流行 Classic indie pop",
    "Chamber_cabaret_and_art_pop": "室内卡巴莱与艺术流行乐 Chamber cabaret & art pop",
    "Soul_or_r_and_b": "灵魂乐或节奏布鲁斯 Soul / R&B",
    "Adult_alternative_rock": "成人另类摇滚 Adult alternative rock",
    "Uplifting_anthemic_rock": "迷幻民族摇滚 Uplifting anthemic rock",
    "Soft_rock": "慢摇滚 Soft rock",
    "Acoustic_pop": "原声流行 Acoustic pop",
}

# Class names in class-id order (position i holds the name for class id i).
CLASSES = list(TRANSLATE)
def most_common_element(input_list):
    """Return the element that occurs most often in ``input_list``.

    Counting and tie-breaking are delegated to ``Counter.most_common``.
    An empty ``input_list`` raises ``IndexError`` because there is no
    top-ranked entry to pick.
    """
    ranked = Counter(input_list).most_common(1)
    # ranked is a list holding a single (element, count) pair.
    winner = ranked[0][0]
    return winner
def _save_spec_segments(spec, dur, prefix, width):
    """Cut a spectrogram into equal-width segments and save each as a JPEG.

    The segments are centred in the spectrogram: any leftover frames that do
    not fill a whole segment are split evenly between the start and the end
    and discarded. Each image is written to
    ``./flagged/{prefix}_{round(dur, 2)}_{start_frame}.jpg``.

    Args:
        spec: 2-D array whose second axis is indexed by frame.
        dur: Duration of the audio as returned by ``librosa.get_duration``.
        prefix: File-name prefix identifying the spectrogram type.
        width: Length of one segment, in the same unit as ``dur``.

    Raises:
        ZeroDivisionError: If ``dur`` is 0 or the computed step is 0.
    """
    total_frames = spec.shape[1]
    # Number of frames that correspond to one `width`-long segment.
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    if count == 0:
        # The clip is shorter than a single window, so nothing can be cut.
        # Report it instead of silently producing no images.
        print(
            f"No {prefix} segment saved: audio ({round(dur, 2)}s) is shorter "
            f"than one {width}s window"
        )
        return

    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        try:
            librosa.display.specshow(spec[:, i : i + step])
            plt.axis("off")
            plt.savefig(
                f"./flagged/{prefix}_{round(dur, 2)}_{i}.jpg",
                bbox_inches="tight",
                pad_inches=0.0,
            )
        finally:
            # Always release the current figure, even if drawing or saving
            # failed, so figures do not accumulate across segments.
            plt.close()


def mp3_to_mel(audio_path: str, width=11.4):
    """Render the log-mel spectrogram of an audio file as segment images.

    Images are written to ``./flagged`` with the ``mel_`` prefix. Any error
    during loading or rendering is printed and swallowed (best effort).

    Args:
        audio_path: Path of the audio file to load with ``librosa.load``.
        width: Length of each saved segment (see ``_save_spec_segments``).
    """
    os.makedirs("./flagged", exist_ok=True)
    try:
        y, sr = librosa.load(audio_path)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        dur = librosa.get_duration(y=y, sr=sr)
        _save_spec_segments(log_mel_spec, dur, "mel", width)

    except Exception as e:
        print(f"Error converting {audio_path} : {e}")


def mp3_to_cqt(audio_path: str, width=11.4):
    """Render the log-power constant-Q spectrogram of an audio file as images.

    Images are written to ``./flagged`` with the ``cqt_`` prefix. Any error
    during loading or rendering is printed and swallowed (best effort).

    Args:
        audio_path: Path of the audio file to load with ``librosa.load``.
        width: Length of each saved segment (see ``_save_spec_segments``).
    """
    os.makedirs("./flagged", exist_ok=True)
    try:
        y, sr = librosa.load(audio_path)
        cqt_spec = librosa.cqt(y=y, sr=sr)
        # The CQT is complex-valued; convert magnitude to power before dB.
        log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
        dur = librosa.get_duration(y=y, sr=sr)
        _save_spec_segments(log_cqt_spec, dur, "cqt", width)

    except Exception as e:
        print(f"Error converting {audio_path} : {e}")


def mp3_to_chroma(audio_path: str, width=11.4):
    """Render the log-scaled chromagram of an audio file as segment images.

    Images are written to ``./flagged`` with the ``chroma_`` prefix. Any error
    during loading or rendering is printed and swallowed (best effort).

    Args:
        audio_path: Path of the audio file to load with ``librosa.load``.
        width: Length of each saved segment (see ``_save_spec_segments``).
    """
    os.makedirs("./flagged", exist_ok=True)
    try:
        y, sr = librosa.load(audio_path)
        chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
        log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
        dur = librosa.get_duration(y=y, sr=sr)
        _save_spec_segments(log_chroma_spec, dur, "chroma", width)

    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
def embed_img(img_path, input_size=224):
    """Load an image file and convert it into a one-image batch tensor.

    The image is opened, converted to RGB, resized to
    ``input_size`` x ``input_size``, turned into a tensor and normalized with
    mean 0.5 and standard deviation 0.5 on each of the three channels.

    Args:
        img_path: Path of the image file to open.
        input_size: Side length, in pixels, of the resized square image.

    Returns:
        The transformed image with an extra leading dimension added by
        ``unsqueeze(0)``.
    """
    channel_mean = (0.5, 0.5, 0.5)
    channel_std = (0.5, 0.5, 0.5)
    steps = [
        transforms.Resize([input_size, input_size]),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    pipeline = transforms.Compose(steps)

    rgb_image = Image.open(img_path).convert("RGB")
    tensor = pipeline(rgb_image)
    return tensor.unsqueeze(0)
def inference(mp3_path, log_name: str, folder_path="./flagged"):
    """Predict the genre of an audio file by majority vote over its segments.

    The audio is converted into spectrogram images by the ``mp3_to_<spec>``
    function selected by the last ``_``-separated token of ``log_name``.
    Every ``.jpg`` found in ``folder_path`` is classified by the model and the
    most frequent class id decides the result.

    Args:
        mp3_path: Path of the audio file; a falsy value means "no input".
        log_name: Model name passed to ``EvalNet``; its last ``_``-separated
            token names the spectrogram type.
        folder_path: Scratch directory holding the spectrogram images. It is
            deleted before and after inference.
            NOTE(review): the ``mp3_to_*`` converters always write to
            ``./flagged``, so a different ``folder_path`` finds no images.

    Returns:
        A ``(filename, label)`` tuple. ``filename`` is ``None`` when no audio
        was given; ``label`` is the bilingual genre label or a message
        explaining why no prediction could be made.

    Raises:
        ValueError: If no ``mp3_to_<spec>`` converter exists for ``log_name``.
    """
    # Start from a clean scratch directory so stale images from an earlier
    # run cannot take part in the vote.
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    if not mp3_path:
        return None, "请输入音频 Please input an audio!"

    network = EvalNet(log_name)
    spec = log_name.split("_")[-1]

    # Resolve the converter by name rather than eval(): log_name comes from
    # the UI/API, and eval() would execute whatever expression it spells.
    converter = globals().get(f"mp3_to_{spec}")
    if not callable(converter):
        raise ValueError(
            f"Unsupported spectrogram type '{spec}' in model name '{log_name}'"
        )

    outputs = []
    try:
        converter(mp3_path)
        file_names = os.listdir(folder_path) if os.path.isdir(folder_path) else []
        # Pure inference: no gradients are needed for the forward passes.
        with torch.no_grad():
            for file_name in file_names:
                if file_name.lower().endswith(".jpg"):
                    file_path = os.path.join(folder_path, file_name)
                    batch = embed_img(file_path)
                    output: torch.Tensor = network.model(batch)
                    # Index of the highest score along dim 1 is the class id.
                    pred_id = torch.max(output.data, 1)[1]
                    outputs.append(int(pred_id))
    finally:
        # Remove the scratch images even when conversion or the model fails.
        shutil.rmtree(folder_path, ignore_errors=True)

    if not outputs:
        # The converter produced no image (decode failure or a clip shorter
        # than one segment); report it instead of crashing on an empty vote.
        return (
            os.path.basename(mp3_path),
            "未能生成频谱图, 音频可能过短或无法解码 "
            "No spectrogram could be generated; the audio may be too short or undecodable!",
        )

    max_count_item = most_common_element(outputs)
    return os.path.basename(mp3_path), TRANSLATE[CLASSES[max_count_item]]
if __name__ == "__main__":
    warnings.filterwarnings("ignore")

    # On Linux, download and unpack a static ffmpeg build next to the app and
    # prepend its directory to PATH.
    # NOTE(review): presumably needed so the audio backend can decode MP3
    # files — confirm against the deployment environment.
    ffmpeg = "ffmpeg-release-amd64-static"
    if sys.platform.startswith("linux"):
        if not os.path.exists(f"./{ffmpeg}.tar.xz"):
            download(
                f"https://www.modelscope.cn/studio/ccmusic-database/music_genre/resolve/master/{ffmpeg}.tar.xz"
            )

        folder_path = f"{os.getcwd()}/{ffmpeg}"
        if not os.path.exists(folder_path):
            # Argument list instead of a shell string: no shell is spawned
            # and the file name is never re-parsed.
            subprocess.call(["tar", "-xvf", f"{ffmpeg}.tar.xz"])

        os.environ["PATH"] = f"{folder_path}:{os.environ.get('PATH', '')}"

    models = get_modelist()
    # Pair every bundled example audio file with a randomly chosen model.
    examples = [[mp3, random.choice(models)] for mp3 in find_mp3_files()]
    # Keep the historical default (7th model) but fall back to the last
    # available one when fewer models are present, instead of IndexError.
    default_model = models[min(6, len(models) - 1)] if models else None

    with gr.Blocks() as demo:
        gr.Interface(
            fn=inference,
            inputs=[
                gr.Audio(label="上传MP3音频 Upload MP3", type="filepath"),
                gr.Dropdown(
                    choices=models,
                    label="选择模型 Select a model",
                    value=default_model,
                ),
            ],
            outputs=[
                gr.Textbox(label="音频文件名 Audio filename", show_copy_button=True),
                gr.Textbox(label="流派识别 Genre recognition", show_copy_button=True),
            ],
            examples=examples,
            cache_examples=False,
            allow_flagging="never",
            title="建议录音时长保持在 15s 以内, 过长会影响识别效率<br>It is recommended to keep the duration of recording within 15s, too long will affect the recognition efficiency.",
        )

        gr.Markdown(
            """
            # 引用 Cite
            ```bibtex
            @dataset{zhaorui_liu_2021_5676893,
            author = {Monan Zhou, Shenyang Xu, Zhaorui Liu, Zhaowen Wang, Feng Yu, Wei Li and Baoqiang Han},
            title = {CCMusic: an Open and Diverse Database for Chinese and General Music Information Retrieval Research},
            month = {mar},
            year = {2024},
            publisher = {HuggingFace},
            version = {1.2},
            url = {https://huggingface.co/ccmusic-database}
            }
            ```"""
        )

    demo.launch()