import os
import random
import shutil
import warnings
from collections import Counter

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch

from model import EvalNet
from utils import get_modelist, find_files, embed_img, TEMP_DIR

TRANSLATE = {
    "vibrato": "颤音",
    "upward_portamento": "上滑音",
    "downward_portamento": "下滑音",
    "returning_portamento": "回滑音",
    "glissando": "刮奏, 花指",
    "tremolo": "摇指",
    "harmonics": "泛音",
    "plucks": "勾, 打, 抹, 托, ...",
}
CLASSES = list(TRANSLATE.keys())
SAMPLE_RATE = 44100
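

# Wrap-around padding: extend a 1-D array to length `end` by repeating it
# from the start, e.g. circular_padding(np.array([1, 2, 3]), 7) gives
# [1, 2, 3, 1, 2, 3, 1]. This lets short clips fill a whole number of
# fixed-width analysis windows without resorting to zero-padding.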
def circular_padding(spec: np.ndarray, end: int):
    size = len(spec)
    if end <= size:
        return spec

    num_padding = end - size
    num_repeat = num_padding // size + int(num_padding % size != 0)
    padding = np.tile(spec, num_repeat)
    return np.concatenate((spec, padding))[:end]
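

# Convert an audio file into a sequence of ~`width`-second log-Mel
# spectrogram slices, saved as JPEGs under TEMP_DIR for the classifier.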
def wav2mel(audio_path: str, width=3):
    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        total_frames = len(y)
        if total_frames % (width * sr) != 0:
            count = total_frames // (width * sr) + 1
            y = circular_padding(y, count * width * sr)

        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        dur = librosa.get_duration(y=y, sr=sr)
        total_frames = log_mel_spec.shape[1]
        step = int(width * total_frames / dur)
        count = int(total_frames / step)
        begin = int(0.5 * (total_frames - count * step))
        end = begin + step * count
        for i in range(begin, end, step):
            librosa.display.specshow(log_mel_spec[:, i : i + step])
            plt.axis("off")
            plt.savefig(
                f"{TEMP_DIR}/{i}.jpg",
                bbox_inches="tight",
                pad_inches=0.0,
            )
            plt.close()

    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
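

# Same slicing pipeline as wav2mel, but using a constant-Q transform (CQT),
# whose logarithmically spaced frequency bins align with musical pitch.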
def wav2cqt(audio_path: str, width=3):
    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        total_frames = len(y)
        if total_frames % (width * sr) != 0:
            count = total_frames // (width * sr) + 1
            y = circular_padding(y, count * width * sr)

        cqt_spec = librosa.cqt(y=y, sr=sr)
        log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
        dur = librosa.get_duration(y=y, sr=sr)
        total_frames = log_cqt_spec.shape[1]
        step = int(width * total_frames / dur)
        count = int(total_frames / step)
        begin = int(0.5 * (total_frames - count * step))
        end = begin + step * count
        for i in range(begin, end, step):
            librosa.display.specshow(log_cqt_spec[:, i : i + step])
            plt.axis("off")
            plt.savefig(
                f"{TEMP_DIR}/{i}.jpg",
                bbox_inches="tight",
                pad_inches=0.0,
            )
            plt.close()

    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
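

# Same slicing pipeline once more, using a chromagram that folds spectral
# energy into the 12 pitch classes.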
def wav2chroma(audio_path: str, width=3):
    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        total_frames = len(y)
        if total_frames % (width * sr) != 0:
            count = total_frames // (width * sr) + 1
            y = circular_padding(y, count * width * sr)

        chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
        log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
        dur = librosa.get_duration(y=y, sr=sr)
        total_frames = log_chroma_spec.shape[1]
        step = int(width * total_frames / dur)
        count = int(total_frames / step)
        begin = int(0.5 * (total_frames - count * step))
        end = begin + step * count
        for i in range(begin, end, step):
            librosa.display.specshow(log_chroma_spec[:, i : i + step])
            plt.axis("off")
            plt.savefig(
                f"{TEMP_DIR}/{i}.jpg",
                bbox_inches="tight",
                pad_inches=0.0,
            )
            plt.close()

    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
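

# Majority vote: the clip-level label is the class predicted most often
# across the per-slice predictions.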
def most_frequent_value(lst: list):
    counter = Counter(lst)
    max_count = max(counter.values())
    for element, count in counter.items():
        if count == max_count:
            return element

    return None
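

# End-to-end inference: clear TEMP_DIR, load the model named by `log_name`,
# regenerate spectrogram slices with the matching wav2* extractor (the
# spectrogram type is the third-from-last "_"-separated token of the model
# name), classify every slice, and majority-vote the results.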
def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    if not wav_path:
        return None, "请输入音频 Please input an audio!"

    try:
        model = EvalNet(log_name, len(TRANSLATE)).model
    except Exception as e:
        return None, f"{e}"

    spec = log_name.split("_")[-3]
    # Dispatch by name instead of eval() for safety and readability
    extractors = {"mel": wav2mel, "cqt": wav2cqt, "chroma": wav2chroma}
    extractors[spec](wav_path)
    jpgs = find_files(folder_path, ".jpg")
    preds = []
    with torch.no_grad():
        for jpg in jpgs:
            img = embed_img(jpg)
            output: torch.Tensor = model(img)
            # .item() turns the 1-element index tensor into a plain int so
            # Counter tallies predictions by value rather than by identity
            preds.append(torch.max(output.data, 1)[1].item())

    pred_id = most_frequent_value(preds)
    return (
        os.path.basename(wav_path),
        f"{TRANSLATE[CLASSES[pred_id]]} ({CLASSES[pred_id].capitalize()})",
    )
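

# Build the Gradio demo; each example recording is paired with a randomly
# chosen model from the checkpoint list.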
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    models = get_modelist()
    examples = []
    example_wavs = find_files()
    for wav in example_wavs:
        examples.append([wav, random.choice(models)])

    with gr.Blocks() as demo:
        gr.Interface(
            fn=infer,
            inputs=[
                gr.Audio(label="上传录音 Upload a recording", type="filepath"),
                gr.Dropdown(
                    choices=models, label="选择模型 Select a model", value=models[0]
                ),
            ],
            outputs=[
                gr.Textbox(label="音频文件名 Audio filename", show_copy_button=True),
                gr.Textbox(
                    label="古筝演奏技法识别 Guzheng playing tech recognition",
                    show_copy_button=True,
                ),
            ],
            examples=examples,
            cache_examples=False,
            flagging_mode="never",
            title="建议录音时长保持在 3s 左右<br>It is recommended to keep the recording length around 3s.",
        )
        gr.Markdown(
            """
# 引用 Cite
```bibtex
@dataset{zhaorui_liu_2021_5676893,
  author    = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
  title     = {CCMusic: an Open and Diverse Database for Chinese Music Information Retrieval Research},
  month     = {mar},
  year      = {2024},
  publisher = {HuggingFace},
  version   = {1.2},
  url       = {https://huggingface.co/ccmusic-database}
}
```"""
        )

    demo.launch()