Initial commit
- app.py +304 -0
- audio/.DS_Store +0 -0
- audio/KEI_EF04_EN038.wav +0 -0
- audio/KEI_EF05_EN038.wav +0 -0
- audio/KEI_EF07_EN038.wav +0 -0
- audio/KEI_EF08_EN038.wav +0 -0
- audio/KEI_EF09_EN038.wav +0 -0
- audio/KEI_EM01_EN038.wav +0 -0
- audio/KEI_EM02_EN038.wav +0 -0
- audio/KEI_EM03_EN038.wav +0 -0
- audio/KEI_EM05_EN038.wav +0 -0
- audio/KEI_EM06_EN038.wav +0 -0
- audio/KEI_KF01_EN038.wav +0 -0
- audio/KEI_KF03_EN038.wav +0 -0
- audio/KEI_KF04_EN038.wav +0 -0
- audio/KEI_KF05_EN038.wav +0 -0
- audio/KEI_KF06_EN038.wav +0 -0
- audio/KEI_KM01_EN038.wav +0 -0
- audio/KEI_KM03_EN038.wav +0 -0
- audio/KEI_KM04_EN038.wav +0 -0
- audio/KEI_KM05_EN038.wav +0 -0
- audio/KEI_KM06_EN038.wav +0 -0
- requirements.txt +79 -0
app.py
ADDED
@@ -0,0 +1,304 @@
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

import torch

from transformers import HubertModel
import torchaudio

from scipy.stats import zscore
from librosa.sequence import dtw as lib_dtw
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as pyo

import gradio as gr

tsne_1 = 'tsne-3d-one'
tsne_2 = 'tsne-3d-two'
tsne_3 = 'tsne-3d-thr'


def mut_normalize_sequences(sq1, sq2, normalize: bool):
    """
    Normalize the sequences together by z-scoring each dimension.
    sq1: numpy array of shape (t1, d)
    sq2: numpy array of shape (t2, d)
    normalize: if True, normalize the sequences together
    """
    if normalize:
        sq1 = np.copy(sq1)
        sq2 = np.copy(sq2)
        len_sq1 = sq1.shape[0]

        arr = np.concatenate((sq1, sq2), axis=0)
        for dim in range(sq1.shape[1]):
            arr[:, dim] = zscore(arr[:, dim])
        sq1 = arr[:len_sq1, :]
        sq2 = arr[len_sq1:, :]
    return sq1, sq2
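
# Illustrative check of the joint z-scoring (hypothetical shapes): it puts both
# feature sequences on a common scale, so the DTW cost below is not dominated
# by per-recording level differences:
#   a = np.random.randn(120, 768)           # 120 frames of 768-dim features
#   b = np.random.randn(95, 768) * 3 + 1.0  # same features, different scale/offset
#   a_n, b_n = mut_normalize_sequences(a, b, True)
#   # np.concatenate([a_n, b_n]) now has ~zero mean and unit variance per dimension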


def librosa_dtw(sq1, sq2):
    """
    Compute the Dynamic Time Warping distance between two sequences.
    sq1: numpy array of shape (t1, d)
    sq2: numpy array of shape (t2, d)
    """
    return lib_dtw(sq1.transpose(), sq2.transpose())[0][-1, -1]
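
# Note: librosa.sequence.dtw expects feature matrices of shape (d, t), hence
# the transposes; it returns (D, wp), and D[-1, -1] is the total accumulated
# alignment cost between the two sequences.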


def time_txt(time, time_frame=5):
    if time % time_frame == 0:
        return f"{round(time * 0.02, 2)}"
    return ""
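
# With the model's 0.02 s frame rate, e.g. time_txt(25) returns "0.5" (seconds),
# while frames that are not multiples of time_frame get an empty label.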


def create_df(feats, speaker_len, names):
    cols = [f"val {i}" for i in range(feats.shape[1])]
    df = pd.DataFrame(feats, columns=cols)
    df['idx'] = df.index
    time_index = {i: speaker_len[i] for i in range(len(speaker_len))}
    com_time_index = {i: sum(speaker_len[:i]) for i in range(len(speaker_len))}
    df_speaker_count = pd.Series(time_index)
    df_speaker_count = df_speaker_count.reindex(df_speaker_count.index.repeat(df_speaker_count.to_numpy())).rename_axis(
        'speaker_id').reset_index()
    df['speaker_id'] = df_speaker_count['speaker_id']
    df['speaker_len'] = df['speaker_id'].apply(lambda i: speaker_len[i])
    df['com_sum'] = df['speaker_id'].apply(lambda i: com_time_index[i])
    df['speaker'] = df['speaker_id'].apply(lambda i: names[i])
    df['time'] = df['idx'] - df['com_sum']
    # time_frame is the module-level setting defined below
    df['time_txt'] = df[['time', 'speaker_len']].apply(lambda row: time_txt(row['time'], time_frame), axis=1)
    assert len(df.loc[df['speaker'] == -1]) == 0
    assert len(df_speaker_count) == len(df)
    df_subset = df.copy()
    data_subset = df_subset[cols].values
    return data_subset, df_subset, cols
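
# Sketch of the result (hypothetical inputs): with speaker_len=[3, 2] and
# names=['a', 'b'], rows get speaker_id [0, 0, 0, 1, 1], per-speaker frame
# times [0, 1, 2, 0, 1], and 'time_txt' holds the corresponding second labels.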


def tsne(data_subset, init='pca', early_exaggeration=12.0, lr='auto', n_comp=3, perplexity=40, iters=1000,
         random_state=None):
    tsne = TSNE(n_components=n_comp, verbose=1, perplexity=perplexity, n_iter=iters, init=init,
                early_exaggeration=early_exaggeration,
                learning_rate=lr, random_state=random_state)
    tsne_results = tsne.fit_transform(data_subset)
    return tsne_results
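
# Note: the n_iter keyword matches the pinned scikit-learn (1.5.2); newer
# releases rename it to max_iter, so this call may need updating on upgrade.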


def fill_tsne(df_subset, tsne_results):
    print(tsne_results[:, 0].shape)
    df_subset[tsne_1] = tsne_results[:, 0]
    df_subset[tsne_2] = tsne_results[:, 1]
    if tsne_results.shape[1] == 3:
        df_subset[tsne_3] = tsne_results[:, 2]
    return df_subset


def plot_tsne(df_subset):
    # pyo.init_notebook_mode()
    fig = px.scatter_3d(df_subset, x=tsne_1, y=tsne_2, z=tsne_3,
                        color='speaker')
    fig.update_traces(mode='lines+markers+text')
    # pyo.iplot(fig, filename='jupyter-styled_bar')
    fig.write_html("tsne_plot_all.html")
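
# plot_tsne is not called by the Gradio flow below; it is a standalone helper
# that dumps an all-speaker plot to tsne_plot_all.html.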


def calc_distance(df_subset, speaker1, speaker2, cols):
    features_speaker1 = df_subset[df_subset['speaker'] == speaker1][cols].to_numpy()
    features_speaker2 = df_subset[df_subset['speaker'] == speaker2][cols].to_numpy()
    features_speaker1, features_speaker2 = mut_normalize_sequences(features_speaker1, features_speaker2, True)
    distance = librosa_dtw(features_speaker1, features_speaker2)
    # Normalize the DTW cost by the combined sequence length so longer
    # recordings do not automatically get larger distances
    distance = distance / (len(features_speaker1) + len(features_speaker2))
    return distance
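
# Example (hypothetical use, with names as built in grPlot below):
#   d = calc_distance(df_subset, names[0], names[1], [tsne_1, tsne_2, tsne_3])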


def plot_two_speakers(speaker1, speaker2, max_s1=None, max_s2=None, df_subset=None, speakerLabel="audio1"):
    def axes_style3d(bgcolor="rgb(20, 20, 20)", gridcolor="rgb(255, 255, 255)"):
        return dict(showbackground=True, backgroundcolor=bgcolor, gridcolor=gridcolor, zeroline=False)
    dcp = df_subset.loc[df_subset['speaker'].isin([speaker1, speaker2])].copy().rename(
        columns={tsne_1: "x", tsne_2: 'y', tsne_3: 'z'})
    dcp1 = dcp.loc[(dcp['speaker'] == speaker1)].copy()
    dcp2 = dcp.loc[(dcp['speaker'] == speaker2)].copy()
    # Color encodes time within each recording (light = start, dark = end)
    dcp1['clr'] = np.linspace(0, 1, dcp.loc[(dcp['speaker'] == speaker1)].shape[0])
    dcp2['clr'] = np.linspace(1, 0, dcp.loc[(dcp['speaker'] == speaker2)].shape[0])

    if max_s1 is not None:
        dcp1 = dcp1[:max_s1]

    if max_s2 is not None:
        dcp2 = dcp2[:max_s2]
    # S1
    fig = px.scatter_3d(dcp1, x='x', y='y', z='z',
                        color='clr', symbol='speaker',
                        text='time_txt',
                        labels={'x': 't-SNE-dim1', 'y': 't-SNE-dim2', 'z': 't-SNE-dim3'})
    fig.update_traces({'name': speakerLabel}, hovertemplate=speakerLabel, marker_symbol='diamond', marker_coloraxis=None, marker_colorscale='burg',
                      mode='lines+markers+text', line_color='lightgray')
    fig.for_each_trace(lambda t: t.update(textfont_color='darkred'))

    # S2 (the baseline recording, EF08)
    fig2 = px.scatter_3d(dcp2, x='x', y='y', z='z',
                         color='clr', symbol='speaker',
                         text='time_txt',
                         labels={'x': 't-SNE-dim1', 'y': 't-SNE-dim2', 'z': 't-SNE-dim3'})
    fig2.update_traces({'name': 'baseline'}, hovertemplate="EF08", marker_coloraxis=None, marker_colorscale='ice', mode='lines+markers+text', line_color='lightgray')
    fig2.for_each_trace(lambda t: t.update(textfont_color='blue'))

    axis_style = axes_style3d(bgcolor='rgb(245, 249, 252)',)  # light background color
    fig3 = go.Figure(data=fig.data + fig2.data)
    fig3.update_layout(scene=dict(
        xaxis=axis_style,
        yaxis=axis_style,
        zaxis=axis_style,
        xaxis_title='dimension 1 (t-SNE)',
        yaxis_title='dimension 2 (t-SNE)',
        zaxis_title='dimension 3 (t-SNE)'),
        margin=dict(r=20, b=10, l=10, t=10),
        legend_title="Speaker", )

    # fig3.show()
    # fig3.write_html("tsne_plot.html")
    return fig3


# The model's label rate is 0.02 seconds. To avoid cluttering the plot, time
# labels are shown every 5 frames (0.1 seconds); to change that, adjust
# "time_frame" below.

time_frame = 5

def grPlot(wav_paths, speakerLabel):
    seed = 31415
    # Load wav files
    expected_sr = 16000
    wavs = []
    for wav_path in wav_paths:
        wav, sr = torchaudio.load(wav_path)
        if sr != expected_sr:
            print(f"Sampling rate of {wav_path} is not {expected_sr} -> Resampling the file")
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=expected_sr)
            wav = resampler(wav)
        # wav has shape (1, num_samples); the channel dimension doubles as the
        # batch dimension for the model
        wavs.append(wav)

    # Generate features
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    print(f'Running on {device_name}')

    model = HubertModel.from_pretrained("facebook/hubert-base-ls960").to(device)
    features = None
    speaker_len = []
    layer = 12
    names = [f.rsplit(".", 1)[0] for f in wav_paths]
    # Not batched, so each recording's actual sequence length is known
    for wav in wavs:
        wav_features = model(wav.to(device), return_dict=True, output_hidden_states=True).hidden_states[
            layer].squeeze().detach().cpu().numpy()
        features = wav_features if features is None else np.concatenate([features, wav_features], axis=0)
        speaker_len.append(wav_features.shape[0])

    # Create & fill a dataframe with the details
    data_subset, df_subset, hubert_feature_columns = create_df(features, speaker_len, names)

    df_subset_orig = df_subset.copy()
    data_subset_orig = data_subset.copy()

    # iters set to 300
    tsne_results = tsne(data_subset, init='pca', early_exaggeration=2.0, lr=100.0, n_comp=3, perplexity=40, iters=300,
                        random_state=seed)
    df_subset = fill_tsne(df_subset, tsne_results)

    # Evaluate the distance between two speakers: the first two entries in wav_paths
    S1 = names[0]
    S2 = names[1]

    # FULL DIMENSIONALITY
    distance = calc_distance(df_subset, S1, S2, hubert_feature_columns)
    print(f"Full Dim. Distance: {distance}")

    # TSNE DIMENSIONALITY
    cols = [tsne_1, tsne_2, tsne_3]
    distance = calc_distance(df_subset, S1, S2, cols)
    print(f"TSNE Dim. Distance: {distance}")

    # t-SNE plot of the two speakers with no min and no max parameters
    fig = plot_two_speakers(S1, S2, None, None, df_subset, speakerLabel)

    return [fig, distance]
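
# Illustrative call using the bundled files: the distance is computed between
# the first two paths; any further paths only help shape the shared t-SNE space.
#   fig, dist = grPlot(["audio/KEI_KF03_EN038.wav", "audio/KEI_EF08_EN038.wav"], "audio1")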


def grAudioInputs(audio1, audio2):
    # The uploaded recording is prepended, so it is compared against the first
    # baseline entry (EF08); the remaining recordings shape the shared t-SNE space.
    baseline_wavs = [
        "audio/KEI_EF08_EN038.wav",
        "audio/KEI_KF04_EN038.wav",
        "audio/KEI_EF04_EN038.wav",
        "audio/KEI_EF05_EN038.wav",
        "audio/KEI_EF07_EN038.wav",
        "audio/KEI_EF09_EN038.wav",
        "audio/KEI_EM01_EN038.wav",
        "audio/KEI_EM02_EN038.wav",
        "audio/KEI_EM03_EN038.wav",
        "audio/KEI_EM05_EN038.wav",
        "audio/KEI_EM06_EN038.wav",
        "audio/KEI_KF01_EN038.wav",
        "audio/KEI_KF03_EN038.wav",
        "audio/KEI_KF05_EN038.wav",
        "audio/KEI_KF06_EN038.wav",
        "audio/KEI_KM01_EN038.wav",
        "audio/KEI_KM03_EN038.wav",
        "audio/KEI_KM04_EN038.wav",
        "audio/KEI_KM05_EN038.wav",
        "audio/KEI_KM06_EN038.wav"]

    # User-uploaded audio1 with EF08_EN038 as the baseline
    grFig1, distance1 = grPlot([audio1] + baseline_wavs, "audio1")

    # User-uploaded audio2 with EF08_EN038 as the baseline
    grFig2, distance2 = grPlot([audio2] + baseline_wavs, "audio2")

    mdText = "Note: Darkness indicates time in recording (light = start; dark = end)"

    return [grFig1, mdText, distance1, grFig2, mdText, distance2]
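
# The six return values of grAudioInputs line up positionally with the six
# output components declared in gr.Interface below.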


demo = gr.Interface(
    grAudioInputs,
    [gr.Audio(sources=["microphone", "upload"], type="filepath"), gr.Audio(sources=["microphone", "upload"], type="filepath")],
    [gr.Plot(label="Perceptual Similarity Space Audio 1", min_width=300, show_label=False, container=True),
     gr.Markdown(),
     gr.Textbox(label="t-SNE Distance"),
     gr.Plot(label="Perceptual Similarity Space Audio 2", min_width=300, show_label=False, container=True),
     gr.Markdown(),
     gr.Textbox(label="t-SNE Distance")],
    title="Perceptual Similarity Space Demo",
    description="<p>Demo based on Chernyak, Bradlow, Keshet, & Goldrick (2024) \"A perceptual space for speech based on self-supervised speech representations\" <a href='https://doi.org/10.1121/10.0026358' target='_blank'>https://doi.org/10.1121/10.0026358</a></p> <p>This demo allows you to upload two recordings of the sentence \"The lady wore a coat\" for comparison against a common baseline – an English L1 production of that sentence. The output will be two visualizations, each showing a 3-dimensional projection of one of your recordings vs. the baseline, along with the perceptual similarity space distance of that recording from the baseline.</p> <p>For example, you could compare a relatively high intelligibility L2 English talker’s production of \"The lady wore a coat\" to the same sentence as produced by a relatively low intelligibility L2 English talker. Based on our results, you’d expect the high intelligibility talker to typically have a smaller distance to the baseline than the low intelligibility talker.</p> <p><b>Requirements:</b> <ul><li>The two recordings must be of the sentence \"The lady wore a coat\" (the distance comparisons are not interpretable if different sentences are used).</li><li>Each must be a mono .wav file.</li><li>Any leading/trailing silence should be trimmed.</li></ul></p><p>Download example files here: <a href='https://huggingface.co/spaces/MLSpeech/perceptual-similiarity/resolve/main/audio/KEI_KF03_EN038.wav' download>high_intelligibility_talker</a>, <a href='https://huggingface.co/spaces/MLSpeech/perceptual-similiarity/resolve/main/audio/KEI_KF01_EN038.wav' download>low_intelligibility_talker</a>. For reference, the baseline speaker can be downloaded <a href='https://huggingface.co/spaces/MLSpeech/perceptual-similiarity/resolve/main/audio/KEI_EF08_EN038.wav' download>here</a>.",
    article="This work was supported by NSF Grant No. 2219843 and BSF Grant No. 2022618.",
    flagging_mode="never",
)

demo.launch()
audio/.DS_Store
ADDED
Binary file (6.15 kB)

audio/KEI_EF04_EN038.wav
ADDED
Binary file (101 kB)

audio/KEI_EF05_EN038.wav
ADDED
Binary file (98.2 kB)

audio/KEI_EF07_EN038.wav
ADDED
Binary file (106 kB)

audio/KEI_EF08_EN038.wav
ADDED
Binary file (114 kB)

audio/KEI_EF09_EN038.wav
ADDED
Binary file (95.6 kB)

audio/KEI_EM01_EN038.wav
ADDED
Binary file (105 kB)

audio/KEI_EM02_EN038.wav
ADDED
Binary file (99.2 kB)

audio/KEI_EM03_EN038.wav
ADDED
Binary file (96.9 kB)

audio/KEI_EM05_EN038.wav
ADDED
Binary file (94.6 kB)

audio/KEI_EM06_EN038.wav
ADDED
Binary file (90.2 kB)

audio/KEI_KF01_EN038.wav
ADDED
Binary file (165 kB)

audio/KEI_KF03_EN038.wav
ADDED
Binary file (154 kB)

audio/KEI_KF04_EN038.wav
ADDED
Binary file (154 kB)

audio/KEI_KF05_EN038.wav
ADDED
Binary file (161 kB)

audio/KEI_KF06_EN038.wav
ADDED
Binary file (142 kB)

audio/KEI_KM01_EN038.wav
ADDED
Binary file (133 kB)

audio/KEI_KM03_EN038.wav
ADDED
Binary file (152 kB)

audio/KEI_KM04_EN038.wav
ADDED
Binary file (116 kB)

audio/KEI_KM05_EN038.wav
ADDED
Binary file (143 kB)

audio/KEI_KM06_EN038.wav
ADDED
Binary file (147 kB)
requirements.txt
ADDED
@@ -0,0 +1,79 @@
aiofiles==23.2.1
annotated-types==0.7.0
anyio==4.6.2.post1
audioread==3.0.1
certifi==2024.8.30
cffi==1.17.1
charset-normalizer==3.4.0
click==8.1.7
decorator==5.1.1
fastapi==0.115.4
ffmpy==0.4.0
filelock==3.16.1
fsspec==2024.10.0
gradio==5.5.0
gradio_client==1.4.2
h11==0.14.0
httpcore==1.0.6
httpx==0.27.2
huggingface-hub==0.26.2
idna==3.10
Jinja2==3.1.4
joblib==1.4.2
lazy_loader==0.4
librosa==0.10.2.post1
llvmlite==0.43.0
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mdurl==0.1.2
mpmath==1.3.0
msgpack==1.1.0
networkx==3.4.2
numba==0.60.0
numpy==1.26.4
orjson==3.10.11
packaging==24.2
pandas==2.2.3
pillow==11.0.0
platformdirs==4.3.6
plotly==5.24.1
pooch==1.8.2
pycparser==2.22
pydantic==2.9.2
pydantic_core==2.23.4
pydub==0.25.1
Pygments==2.18.0
python-dateutil==2.9.0.post0
python-multipart==0.0.12
pytz==2024.2
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
rich==13.9.4
ruff==0.7.3
safehttpx==0.1.1
safetensors==0.4.5
scikit-learn==1.5.2
scipy==1.14.1
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
soundfile==0.12.1
soxr==0.5.0.post1
starlette==0.41.2
sympy==1.13.3
tenacity==9.0.0
threadpoolctl==3.5.0
tokenizers==0.20.3
tomlkit==0.12.0
torch==2.2.2
torchaudio==2.2.2
tqdm==4.67.0
transformers==4.46.2
typer==0.13.0
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.2.3
uvicorn==0.32.0
websockets==12.0