MLSpeech committed (verified)
Commit 5ab83b1 · 1 Parent(s): 840fab2

Initial commit

app.py ADDED
@@ -0,0 +1,304 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.manifold import TSNE
+
+ import torch
+
+ from transformers import HubertModel
+ import torchaudio
+
+ from scipy.stats import zscore
+ from librosa.sequence import dtw as lib_dtw
+ import plotly.express as px
+ import plotly.graph_objs as go
+ import plotly.offline as pyo
+
+ import gradio as gr
+
+ tsne_1 = 'tsne-3d-one'
+ tsne_2 = 'tsne-3d-two'
+ tsne_3 = 'tsne-3d-thr'
+
+
+ def mut_normalize_sequences(sq1, sq2, normalize: bool):
+     """
+     Normalize the sequences together by z-scoring each dimension.
+     sq1: numpy array of shape (t1, d)
+     sq2: numpy array of shape (t2, d)
+     normalize: if True, normalize the sequences together
+     """
+     if normalize:
+         sq1 = np.copy(sq1)
+         sq2 = np.copy(sq2)
+         len_sq1 = sq1.shape[0]
+
+         arr = np.concatenate((sq1, sq2), axis=0)
+         for dim in range(sq1.shape[1]):
+             arr[:, dim] = zscore(arr[:, dim])
+         sq1 = arr[:len_sq1, :]
+         sq2 = arr[len_sq1:, :]
+     return sq1, sq2
+
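+ # e.g., with sq1 of shape (t1, d) and sq2 of shape (t2, d), each of the d dimensions is z-scored
+ # over all t1 + t2 frames, so both sequences share one scale before DTW is applied.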
+
+ def librosa_dtw(sq1, sq2):
+     """
+     Compute the Dynamic Time Warping distance between two sequences.
+     sq1: numpy array of shape (t1, d)
+     sq2: numpy array of shape (t2, d)
+     """
+     return lib_dtw(sq1.transpose(), sq2.transpose())[0][-1, -1]
+
+
+ def time_txt(time, time_frame=5):
+     # Label every `time_frame`-th frame with its time in seconds (one frame = 0.02 s).
+     if time % time_frame == 0:
+         return f"{round(time * 0.02, 2)}"
+     return ""
+
+
+ def create_df(feats, speaker_len, names):
+     # Build a dataframe with one row per feature frame, annotated with speaker and time labels.
+     cols = [f"val {i}" for i in range(feats.shape[1])]
+     df = pd.DataFrame(feats, columns=cols)
+     df['idx'] = df.index
+     time_index = {i: speaker_len[i] for i in range(len(speaker_len))}
+     com_time_index = {i: sum(speaker_len[:i]) for i in range(len(speaker_len))}
+     df_speaker_count = pd.Series(time_index)
+     df_speaker_count = df_speaker_count.reindex(df_speaker_count.index.repeat(df_speaker_count.to_numpy())).rename_axis(
+         'speaker_id').reset_index()
+     df['speaker_id'] = df_speaker_count['speaker_id']
+     df['speaker_len'] = df['speaker_id'].apply(lambda row: speaker_len[row])
+     df['com_sum'] = df['speaker_id'].apply(lambda i: com_time_index[i])
+     df['speaker'] = df['speaker_id'].apply(lambda i: names[i])
+     df['time'] = df['idx'] - df['com_sum']
+     # time_frame is the module-level setting defined below (labels every 5 frames by default).
+     df['time_txt'] = df[['time', 'speaker_len']].apply(lambda row: time_txt(row['time'], time_frame), axis=1)
+     assert len(df.loc[df['speaker'] == -1]) == 0
+     assert len(df_speaker_count) == len(df)
+     df_subset = df.copy()
+     data_subset = df_subset[cols].values
+     return data_subset, df_subset, cols
+
+
+ def tsne(data_subset, init='pca', early_exaggeration=12.0, lr='auto', n_comp=3, perplexity=40, iters=1000,
+          random_state=None):
+     tsne_model = TSNE(n_components=n_comp, verbose=1, perplexity=perplexity, n_iter=iters, init=init,
+                       early_exaggeration=early_exaggeration,
+                       learning_rate=lr, random_state=random_state)
+     tsne_results = tsne_model.fit_transform(data_subset)
+     return tsne_results
+
+
+ def fill_tsne(df_subset, tsne_results):
+     print(tsne_results[:, 0].shape)
+     df_subset[tsne_1] = tsne_results[:, 0]
+     df_subset[tsne_2] = tsne_results[:, 1]
+     if tsne_results.shape[1] == 3:
+         df_subset[tsne_3] = tsne_results[:, 2]
+     return df_subset
+
+
+ def plot_tsne(df_subset):
+     # pyo.init_notebook_mode()
+     fig = px.scatter_3d(df_subset, x=tsne_1, y=tsne_2, z=tsne_3,
+                         color='speaker')
+     fig.update_traces(mode='lines+markers+text')
+     # pyo.iplot(fig, filename='jupyter-styled_bar')
+     fig.write_html("tsne_plot_all.html")
+
+
+ def calc_distance(df_subset, speaker1, speaker2, cols):
+     features_speaker1 = df_subset[df_subset['speaker'] == speaker1][cols].to_numpy()
+     features_speaker2 = df_subset[df_subset['speaker'] == speaker2][cols].to_numpy()
+     features_speaker1, features_speaker2 = mut_normalize_sequences(features_speaker1, features_speaker2, True)
+     distance = librosa_dtw(features_speaker1, features_speaker2)
+     # Normalize the DTW cost by the combined length so scores are comparable across recordings.
+     distance = distance / (len(features_speaker1) + len(features_speaker2))
+     return distance
+
+
+ def plot_two_speakers(speaker1, speaker2, max_s1=None, max_s2=None, df_subset=None, speakerLabel="audio1"):
+     def axes_style3d(bgcolor="rgb(20, 20, 20)", gridcolor="rgb(255, 255, 255)"):
+         return dict(showbackground=True, backgroundcolor=bgcolor, gridcolor=gridcolor, zeroline=False)
+     dcp = df_subset.loc[df_subset['speaker'].isin([speaker1, speaker2])].copy().rename(
+         columns={tsne_1: "x", tsne_2: 'y', tsne_3: 'z'})
+     dcp1 = dcp.loc[(dcp['speaker'] == speaker1)].copy()
+     dcp2 = dcp.loc[(dcp['speaker'] == speaker2)].copy()
+     # Color encodes time within each recording (light = start, dark = end).
+     dcp1['clr'] = np.linspace(0, 1, dcp1.shape[0])
+     dcp2['clr'] = np.linspace(1, 0, dcp2.shape[0])
+
+     if max_s1 is not None:
+         dcp1 = dcp1[:max_s1]
+
+     if max_s2 is not None:
+         dcp2 = dcp2[:max_s2]
+     # S1: the user-supplied recording
+     fig = px.scatter_3d(dcp1, x='x', y='y', z='z',
+                         color='clr', symbol='speaker',
+                         text='time_txt',
+                         labels={'x': 't-SNE-dim1', 'y': 't-SNE-dim2', 'z': 't-SNE-dim3'})
+     fig.update_traces({'name': speakerLabel}, hovertemplate=speakerLabel, marker_symbol='diamond', marker_coloraxis=None, marker_colorscale='burg',
+                       mode='lines+markers+text', line_color='lightgray')
+     fig.for_each_trace(lambda t: t.update(textfont_color='darkred'))
+
+     # S2: the baseline recording (KEI_EF08_EN038)
+     fig2 = px.scatter_3d(dcp2, x='x', y='y', z='z',
+                          color='clr', symbol='speaker',
+                          text='time_txt',
+                          labels={'x': 't-SNE-dim1', 'y': 't-SNE-dim2', 'z': 't-SNE-dim3'})
+     fig2.update_traces({'name': 'baseline'}, hovertemplate="EF08", marker_coloraxis=None, marker_colorscale='ice', mode='lines+markers+text', line_color='lightgray')
+     fig2.for_each_trace(lambda t: t.update(textfont_color='blue'))
+
+     axis_style = axes_style3d(bgcolor='rgb(245, 249, 252)')  # light background color
+     fig3 = go.Figure(data=fig.data + fig2.data)
+     fig3.update_layout(scene=dict(
+         xaxis=axis_style,
+         yaxis=axis_style,
+         zaxis=axis_style,
+         xaxis_title='dimension 1 (t-SNE)',
+         yaxis_title='dimension 2 (t-SNE)',
+         zaxis_title='dimension 3 (t-SNE)'),
+         margin=dict(r=20, b=10, l=10, t=10),
+         legend_title="Speaker")
+
+     # fig3.show()
+     # fig3.write_html("tsne_plot.html")
+     return fig3
+
+ # Model's label rate is 0.02 seconds. To not overflow the plot, time is shown every 5 samples (0.1 seconds).
+ # To change that, change "time_frame" below.
+
+ time_frame = 5
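+ # e.g., with time_frame = 5, time labels appear at 0.0, 0.1, 0.2, ... seconds (5 frames x 0.02 s per frame).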
+
+ def grPlot(wav_paths, speakerLabel):
+     seed = 31415
+     # Load wav files
+     expected_sr = 16000
+     wavs = []
+     for wav_path in wav_paths:
+         wav, sr = torchaudio.load(wav_path)
+         if sr != expected_sr:
+             print(f"Sampling rate of {wav_path} is not {expected_sr} -> Resampling the file")
+             resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=expected_sr)
+             wav = resampler(wav)
+         # Downmix to mono while keeping the (batch, time) shape HuBERT expects.
+         wav = wav.mean(dim=0, keepdim=True)
+         wavs.append(wav)
+
+     # Generate Features
+     device_name = "cuda" if torch.cuda.is_available() else "cpu"
+     device = torch.device(device_name)
+     print(f'Running on {device_name}')
+
+     model = HubertModel.from_pretrained("facebook/hubert-base-ls960").to(device).eval()
+     features = None
+     speaker_len = []
+     # hidden_states holds the embedding output plus the 12 transformer layers; index 12 is the final layer.
+     layer = 12
+     names = [f.rsplit(".", 1)[0] for f in wav_paths]
+     # Not batched, so the actual sequence length of each recording is known
+     for wav in wavs:
+         wav_features = model(wav.to(device), return_dict=True, output_hidden_states=True).hidden_states[
+             layer].squeeze().detach().cpu().numpy()
+         features = wav_features if features is None else np.concatenate([features, wav_features], axis=0)
+         speaker_len.append(wav_features.shape[0])
+
+     # Create & fill a dataframe with the details
+     data_subset, df_subset, hubert_feature_columns = create_df(features, speaker_len, names)
+
+     df_subset_orig = df_subset.copy()
+     data_subset_orig = data_subset.copy()
+
+     # iters set to 300
+     tsne_results = tsne(data_subset, init='pca', early_exaggeration=2.0, lr=100.0, n_comp=3, perplexity=40, iters=300,
+                         random_state=seed)
+     df_subset = fill_tsne(df_subset, tsne_results)
+
+     # Evaluate the distance of two speakers: the first two entries in the wav_paths array
+     S1 = names[0]
+     S2 = names[1]
+
+     # FULL DIMENSIONALITY
+     distance = calc_distance(df_subset, S1, S2, hubert_feature_columns)
+     print(f"Full Dim. Distance: {distance}")
+
+     # TSNE DIMENSIONALITY
+     cols = [tsne_1, tsne_2, tsne_3]
+     distance = calc_distance(df_subset, S1, S2, cols)
+     print(f"TSNE Dim. Distance: {distance}")
+
+     # t-SNE plot of the two speakers with no min and no max parameters
+     fig = plot_two_speakers(S1, S2, None, None, df_subset, speakerLabel)
+
+     return [fig, distance]
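+ # Example call (hypothetical user file as the first path, KEI_EF08 as the second/baseline path):
+ #     fig, dist = grPlot(["my_recording.wav", "audio/KEI_EF08_EN038.wav"], "audio1")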
+
+
+ def grAudioInputs(audio1, audio2):
+     # KEI_EF08_EN038.wav (listed first) is the English L1 baseline; the remaining recordings are
+     # embedded alongside it so the shared t-SNE space has more context points.
+     baseline_paths = [
+         "audio/KEI_EF08_EN038.wav",
+         "audio/KEI_KF04_EN038.wav",
+         "audio/KEI_EF04_EN038.wav",
+         "audio/KEI_EF05_EN038.wav",
+         "audio/KEI_EF07_EN038.wav",
+         "audio/KEI_EF09_EN038.wav",
+         "audio/KEI_EM01_EN038.wav",
+         "audio/KEI_EM02_EN038.wav",
+         "audio/KEI_EM03_EN038.wav",
+         "audio/KEI_EM05_EN038.wav",
+         "audio/KEI_EM06_EN038.wav",
+         "audio/KEI_KF01_EN038.wav",
+         "audio/KEI_KF03_EN038.wav",
+         "audio/KEI_KF05_EN038.wav",
+         "audio/KEI_KF06_EN038.wav",
+         "audio/KEI_KM01_EN038.wav",
+         "audio/KEI_KM03_EN038.wav",
+         "audio/KEI_KM04_EN038.wav",
+         "audio/KEI_KM05_EN038.wav",
+         "audio/KEI_KM06_EN038.wav"]
+
+     # user-uploaded audio1 with KEI_EF08_EN038 as baseline
+     grFig1, distance1 = grPlot([audio1] + baseline_paths, "audio1")
+
+     # user-uploaded audio2 with KEI_EF08_EN038 as baseline
+     grFig2, distance2 = grPlot([audio2] + baseline_paths, "audio2")
+
+     mdText = "Note: Darkness indicates time in recording (light = start; dark = end)"
+
+     return [grFig1, mdText, distance1, grFig2, mdText, distance2]
+
+
+ demo = gr.Interface(
+     grAudioInputs,
+     [gr.Audio(sources=["microphone", "upload"], type="filepath"), gr.Audio(sources=["microphone", "upload"], type="filepath")],
+     [gr.Plot(label="Perceptual Similarity Space Audio 1", min_width=300, show_label=False, container=True),
+      gr.Markdown(),
+      gr.Textbox(label="t-SNE Distance"),
+      gr.Plot(label="Perceptual Similarity Space Audio 2", min_width=300, show_label=False, container=True),
+      gr.Markdown(),
+      gr.Textbox(label="t-SNE Distance")],
+     title="Perceptual Similarity Space Demo",
+     description="<p>Demo based on Chernyak, Bradlow, Keshet, & Goldrick (2024) \"A perceptual space for speech based on self-supervised speech representations\" <a href='https://doi.org/10.1121/10.0026358' target='_blank'>https://doi.org/10.1121/10.0026358</a></p> <p>This demo allows you to upload two recordings of the sentence \"The lady wore a coat\" for comparison against a common baseline – an English L1 production of that sentence. The output will be two visualizations, each showing a 3-dimensional projection of one of your recordings vs. the baseline, along with the perceptual similarity space distance of that recording from the baseline.</p> <p>For example, you could compare a relatively high intelligibility L2 English talker’s production of \"The lady wore a coat\" to the same sentence as produced by a relatively low intelligibility L2 English talker. Based on our results, you’d expect the high intelligibility talker to typically have a smaller distance to the baseline relative to the low intelligibility talker.</p> <p><b>Requirements:</b> <ul><li>The two recordings must be of the sentence \"The lady wore a coat\" (the distance comparisons are not interpretable if different sentences are used).</li><li>Each must be a mono .wav file.</li><li>Any leading/trailing silence should be trimmed.</li></ul></p><p>Download example files here: <a href='https://huggingface.co/spaces/MLSpeech/perceptual-similiarity/resolve/main/audio/KEI_KF03_EN038.wav' download>high_intelligibility_talker</a>, <a href='https://huggingface.co/spaces/MLSpeech/perceptual-similiarity/resolve/main/audio/KEI_KF01_EN038.wav' download>low_intelligibility_talker</a>. For reference, the baseline speaker can be downloaded <a href='https://huggingface.co/spaces/MLSpeech/perceptual-similiarity/resolve/main/audio/KEI_EF08_EN038.wav' download>here</a>.",
+     article="This work was supported by NSF Grant No. 2219843 and BSF Grant No. 2022618.",
+     flagging_mode="never",
+ )
+
+ demo.launch()
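
The description above requires mono .wav recordings of "The lady wore a coat" with leading/trailing silence trimmed. A minimal preparation sketch using librosa and soundfile (both pinned in requirements.txt below); the filenames and the 30 dB trim threshold are placeholder choices:

    import librosa
    import soundfile as sf

    # Load as 16 kHz mono (app.py resamples to 16 kHz in any case) and trim leading/trailing silence.
    y, sr = librosa.load("my_recording.wav", sr=16000, mono=True)
    y_trimmed, _ = librosa.effects.trim(y, top_db=30)
    sf.write("my_recording_trimmed.wav", y_trimmed, sr)
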
audio/.DS_Store ADDED
Binary file (6.15 kB)

audio/KEI_EF04_EN038.wav ADDED
Binary file (101 kB)

audio/KEI_EF05_EN038.wav ADDED
Binary file (98.2 kB)

audio/KEI_EF07_EN038.wav ADDED
Binary file (106 kB)

audio/KEI_EF08_EN038.wav ADDED
Binary file (114 kB)

audio/KEI_EF09_EN038.wav ADDED
Binary file (95.6 kB)

audio/KEI_EM01_EN038.wav ADDED
Binary file (105 kB)

audio/KEI_EM02_EN038.wav ADDED
Binary file (99.2 kB)

audio/KEI_EM03_EN038.wav ADDED
Binary file (96.9 kB)

audio/KEI_EM05_EN038.wav ADDED
Binary file (94.6 kB)

audio/KEI_EM06_EN038.wav ADDED
Binary file (90.2 kB)

audio/KEI_KF01_EN038.wav ADDED
Binary file (165 kB)

audio/KEI_KF03_EN038.wav ADDED
Binary file (154 kB)

audio/KEI_KF04_EN038.wav ADDED
Binary file (154 kB)

audio/KEI_KF05_EN038.wav ADDED
Binary file (161 kB)

audio/KEI_KF06_EN038.wav ADDED
Binary file (142 kB)

audio/KEI_KM01_EN038.wav ADDED
Binary file (133 kB)

audio/KEI_KM03_EN038.wav ADDED
Binary file (152 kB)

audio/KEI_KM04_EN038.wav ADDED
Binary file (116 kB)

audio/KEI_KM05_EN038.wav ADDED
Binary file (143 kB)

audio/KEI_KM06_EN038.wav ADDED
Binary file (147 kB)

requirements.txt ADDED
@@ -0,0 +1,79 @@
+ aiofiles==23.2.1
+ annotated-types==0.7.0
+ anyio==4.6.2.post1
+ audioread==3.0.1
+ certifi==2024.8.30
+ cffi==1.17.1
+ charset-normalizer==3.4.0
+ click==8.1.7
+ decorator==5.1.1
+ fastapi==0.115.4
+ ffmpy==0.4.0
+ filelock==3.16.1
+ fsspec==2024.10.0
+ gradio==5.5.0
+ gradio_client==1.4.2
+ h11==0.14.0
+ httpcore==1.0.6
+ httpx==0.27.2
+ huggingface-hub==0.26.2
+ idna==3.10
+ Jinja2==3.1.4
+ joblib==1.4.2
+ lazy_loader==0.4
+ librosa==0.10.2.post1
+ llvmlite==0.43.0
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ mdurl==0.1.2
+ mpmath==1.3.0
+ msgpack==1.1.0
+ networkx==3.4.2
+ numba==0.60.0
+ numpy==1.26.4
+ orjson==3.10.11
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.0.0
+ platformdirs==4.3.6
+ plotly==5.24.1
+ pooch==1.8.2
+ pycparser==2.22
+ pydantic==2.9.2
+ pydantic_core==2.23.4
+ pydub==0.25.1
+ Pygments==2.18.0
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.12
+ pytz==2024.2
+ PyYAML==6.0.2
+ regex==2024.11.6
+ requests==2.32.3
+ rich==13.9.4
+ ruff==0.7.3
+ safehttpx==0.1.1
+ safetensors==0.4.5
+ scikit-learn==1.5.2
+ scipy==1.14.1
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ soundfile==0.12.1
+ soxr==0.5.0.post1
+ starlette==0.41.2
+ sympy==1.13.3
+ tenacity==9.0.0
+ threadpoolctl==3.5.0
+ tokenizers==0.20.3
+ tomlkit==0.12.0
+ torch==2.2.2
+ torchaudio==2.2.2
+ tqdm==4.67.0
+ transformers==4.46.2
+ typer==0.13.0
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ urllib3==2.2.3
+ uvicorn==0.32.0
+ websockets==12.0