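"""Gradio demo for frame-level guzheng playing-technique detection.

The app converts an uploaded recording into a spectral representation
(mel spectrogram, CQT or chroma), splits it into fixed-length chunks,
and classifies each frame into one of seven guzheng playing techniques.
"""
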
import os
import torch
import librosa
import warnings
import numpy as np
import pandas as pd
import gradio as gr
import librosa.display
from model import EvalNet, t_EvalNet
from utils import get_modelist, find_files, embed, MODEL_DIR
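
# Mapping from pinyin technique labels to English names; the key order
# defines the class index order used when decoding model predictions.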
TRANSLATE = {
    "chanyin": "Vibrato",  # 颤音
    "boxian": "Plucks",  # 拨弦
    "shanghua": "Upward Portamento",  # 上滑音
    "xiahua": "Downward Portamento",  # 下滑音
    "huazhi/guazou/lianmo/liantuo": "Glissando",  # 花指\刮奏\连抹\连托
    "yaozhi": "Tremolo",  # 摇指
    "dianyin": "Point Note",  # 点音
}
CLASSES = list(TRANSLATE.keys())
TEMP_DIR = "./__pycache__/tmp"
SAMPLE_RATE = 44100
HOP_LENGTH = 512
TIME_LENGTH = 3  # seconds per chunk
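
# Spectral front ends: each returns a (bins, frames) array computed with the
# shared SAMPLE_RATE / HOP_LENGTH settings, so all three produce the same
# frame rate.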
def logMel(y, sr=SAMPLE_RATE):
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH,
        fmin=27.5,
    )
    return librosa.power_to_db(mel, ref=np.max)
def logCqt(y, sr=SAMPLE_RATE):
    cqt = librosa.cqt(
        y,
        sr=sr,
        hop_length=HOP_LENGTH,
        fmin=27.5,
        n_bins=88,
        bins_per_octave=12,
    )
    return ((1.0 / 80.0) * librosa.amplitude_to_db(np.abs(cqt), ref=np.max)) + 1.0
def logChroma(y, sr=SAMPLE_RATE):
    chroma = librosa.feature.chroma_stft(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH,
    )
    return (
        (1.0 / 80.0) * librosa.amplitude_to_db(np.abs(chroma), ref=np.max)
    ) + 1.0
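
# Standardisation helpers: RoW_norm estimates a per-frequency-bin mean and
# standard deviation, counting only frames whose bin sum is non-zero (which
# excludes the zero padding added by chunk_data); norm applies them to every
# chunk.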
def RoW_norm(data):
    common_sum = 0
    square_sum = 0
    tfle = 0
    for i in range(len(data)):
        tfle += (data[i].sum(-1).sum(0) != 0).astype("float").sum()
        common_sum += data[i].sum(-1).sum(-1)
        square_sum += (data[i] ** 2).sum(-1).sum(-1)
    common_avg = common_sum / tfle
    square_avg = square_sum / tfle
    std = np.sqrt(square_avg - common_avg**2)
    return common_avg, std
def norm(data):
    size = data.shape
    avg, std = RoW_norm(data)
    avg = np.tile(avg.reshape((1, -1, 1, 1)), (size[0], 1, size[2], size[3]))
    std = np.tile(std.reshape((1, -1, 1, 1)), (size[0], 1, size[2], size[3]))
    return (data - avg) / std
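
# Splits a (bins, frames) spectrogram into non-overlapping chunks of
# TIME_LENGTH seconds (SAMPLE_RATE * TIME_LENGTH // HOP_LENGTH frames each),
# zero-padding the tail so every chunk has the same length.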
def chunk_data(f):
    x = []
    xdata = np.transpose(f)
    s = SAMPLE_RATE * TIME_LENGTH // HOP_LENGTH
    length = int(np.ceil((int(len(xdata) / s) + 1) * s))
    app = np.zeros((length - xdata.shape[0], xdata.shape[1]))
    xdata = np.concatenate((xdata, app), 0)
    for i in range(int(length / s)):
        data = xdata[int(i * s) : int(i * s + s)]
        x.append(np.transpose(data[:s, :]))
    return np.array(x)
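
# End-to-end feature pipeline: decode the audio, compute the requested
# spectral feature, chunk it and normalise it, returning a list of
# (bins, frames, 1) arrays.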
def load(audio_path: str, converto="mel"):
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    extractors = {"mel": logMel, "cqt": logCqt, "chroma": logChroma}
    spec = extractors[converto.lower()](y, sr)
    x_spec = chunk_data(spec)
    Xtr_spec = np.expand_dims(x_spec, axis=3)
    return list(norm(Xtr_spec))
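
# Formats a duration in seconds as HH:MM:SS.mmm.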
def format_second(seconds):
    integer_part = int(seconds)
    millis = round((seconds - integer_part) * 1000)
    hours, remainder = divmod(integer_part, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"
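
# Inference entry point for the Gradio interface: the checkpoint name is
# parsed into backbone and feature type (e.g. "VGG19_mel"), the matching
# EvalNet / t_EvalNet is loaded from MODEL_DIR, and the predictions are
# returned as a per-frame table of detected techniques.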
def infer(audio_path: str, log_name: str):
    if not audio_path:
        return "Please input an audio!", None
    backbone = "_".join(log_name.split("_")[:-1])
    spec = log_name.split("_")[-1]
    try:
        feats = load(audio_path, converto=spec)
        dur = librosa.get_duration(path=audio_path)
        frames_per_3s = feats[0].shape[1]
        if "vit" in backbone or "swin" in backbone:
            eval_net = t_EvalNet(
                backbone,
                len(TRANSLATE),
                feats[0].shape[1],
                weight_path=f"{MODEL_DIR}/{log_name}.pt",
            )
        else:
            eval_net = EvalNet(
                backbone,
                len(TRANSLATE),
                feats[0].shape[1],
                weight_path=f"{MODEL_DIR}/{log_name}.pt",
            )
        input_size = eval_net.get_input_size()
        embedded_input = embed(feats, input_size)
        output = list(eval_net.forward(embedded_input))
    except Exception as e:
        return f"{e}", None
    index = 0
    outputs = []
    for y in output:
        preds = list(y.T)
        for pred in preds:
            start = index * TIME_LENGTH / frames_per_3s
            if start > dur:
                break
            to = (index + 1) * TIME_LENGTH / frames_per_3s
            outputs.append(
                {
                    "Frame": f"{format_second(start)} - {format_second(to)}",
                    "Tech": TRANSLATE[CLASSES[torch.argmax(pred).item()]],
                }
            )
            index += 1
    return os.path.basename(audio_path), pd.DataFrame(outputs)
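
# Assemble the Gradio UI: an audio upload, a checkpoint selector, a
# frame-level results table and the citation block, then launch the demo.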
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    models = get_modelist(assign_model="VGG19_mel")
    examples = []
    example_wavs = find_files()
    for wav in example_wavs:
        examples.append([wav, models[0]])
    with gr.Blocks() as demo:
        gr.Interface(
            fn=infer,
            inputs=[
                gr.Audio(label="Upload audio", type="filepath"),
                gr.Dropdown(choices=models, label="Select a model", value=models[0]),
            ],
            outputs=[
                gr.Textbox(label="Audio filename", show_copy_button=True),
                gr.Dataframe(label="Frame-level guzheng playing technique detection"),
            ],
            examples=examples,
            cache_examples=False,
            flagging_mode="never",
            title="It is recommended to keep the recording short",
        )
        gr.Markdown(
            """
# Cite
```bibtex
@article{Zhou-2025,
    title   = {CCMusic: an Open and Diverse Database for Chinese Music Information Retrieval Research},
    author  = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
    journal = {Transactions of the International Society for Music Information Retrieval},
    year    = {2025}
}
```"""
        )
    demo.launch()