MLSpeech committed on
Commit
2d2c8d9
·
verified ·
1 Parent(s): a3afafe

Modify app to include csv map file to constrain pairwise comparisons

Browse files
Files changed (1) hide show
  1. app.py +42 -8
app.py CHANGED
@@ -10,7 +10,6 @@ import spaces
10
  from itertools import combinations
11
  import os
12
 
13
-
14
  def mut_normalize_sequences(sq1, sq2, normalize: bool):
15
  """
16
  Normalize the sequences together by z-scoring each dimension.
@@ -83,7 +82,11 @@ def calc_distance(df_subset, speaker1, speaker2, cols):
83
  time_frame = 5
84
 
85
  # @spaces.GPU(duration=120)
86
- def grMeasureDistance(wav_paths):
 
 
 
 
87
  if wav_paths is None:
88
  gr.Warning("Please upload some sound files!")
89
  return None
@@ -122,15 +125,33 @@ def grMeasureDistance(wav_paths):
122
 
123
  # Evaluate Distance of all speaker pairs
124
  distances_list = []
125
- wav_pairs = list(combinations(names, 2))
 
 
 
 
 
 
 
 
 
 
126
  for wav_pair in wav_pairs:
127
  S1 = wav_pair[0]
128
  S2 = wav_pair[1]
 
 
129
  # FULL DIMENSIONALITY
130
  distance = calc_distance(df_subset, S1, S2, hubert_feature_columns)
131
  distances_list.append([os.path.basename(S1), os.path.basename(S2), distance])
132
  return distances_list
133
 
 
 
 
 
 
 
134
  #csv export function
135
  def export_csv(d):
136
  if(len(d.iloc[0,0])>0):
@@ -142,6 +163,7 @@ def clearInterface():
142
 
143
  #main GradIO interface
144
  with gr.Blocks() as demo:
 
145
  gr.Markdown(
146
  """
147
  # PS3-PDM: Perceptual Similarity Space for Speech-Pairwise Distance Matrix
@@ -150,7 +172,7 @@ with gr.Blocks() as demo:
150
  - Supported by the National Science Foundation (DRL 2219843) and Binational Science Foundation (2022618)
151
 
152
  ## Description
153
- Takes a set of utterance files (.wav format) and generates all pair-wise distances of the corresponding trajectories in HuBERT embedding spaces. Methods are based on Kim et al. (2025) and Chernyak et al. (2024). We report distances for embeddings in the original embedding space of transformer layer 12, without any form of dimensionality reduction.
154
  """)
155
 
156
  with gr.Accordion("Click for more details", open=False):
@@ -166,14 +188,25 @@ with gr.Blocks() as demo:
166
 
167
  ## Requirements
168
  - All speech files must be in .wav format. (Note: It is recommended to normalize the loudness of the files.)
169
- - All speech files must contain productions of the identical linguistic content (i.e., same words in same order).
170
  - For example, the files may contain productions of a given sentence by different talkers, or by a single talker under different conditions.
171
  - Note that while the utility will return distance values for files with different content the interpretation of these values is meaningless.
172
 
173
  ## Usage
174
  - Upload wav files.
 
 
 
 
 
 
 
 
 
 
 
175
  - Click 'run' to get distances.
176
- - Output (download in .csv format) consists of a table with 3 columns (sentence1, sentence2, distance) and (n*(n-1))/2 rows where n = number of sentences (.wav soundfiles).
177
 
178
  ## Capacity limits
179
  - Processing time is approximately 7 times the duration of the input audio files. For example, a minute of audio can take up to 7 minutes to process. If processing is taking longer than expected, please refresh the page and reupload your files.
@@ -188,14 +221,15 @@ with gr.Blocks() as demo:
188
  """
189
  )
190
  with gr.Row():
191
- inputFiles = gr.File(file_count="multiple", file_types=[".wav"])
 
192
  with gr.Column():
193
  runbtn = gr.Button("Run")
194
  csv = gr.File(interactive=False, visible=False)
195
  dataframe = gr.Dataframe(headers=["S1", "S2", "distance"], visible=True, row_count=[1, 'dynamic'])
196
 
197
 
198
- runbtn.click(fn=grMeasureDistance, inputs=inputFiles, outputs=dataframe)
199
  dataframe.change(export_csv, inputs=dataframe, outputs=csv)
200
 
201
  inputFiles.change(fn=clearInterface, inputs=None, outputs=[csv, dataframe])
 
10
  from itertools import combinations
11
  import os
12
 
 
13
  def mut_normalize_sequences(sq1, sq2, normalize: bool):
14
  """
15
  Normalize the sequences together by z-scoring each dimension.
 
82
  time_frame = 5
83
 
84
  # @spaces.GPU(duration=120)
85
+ def grMeasureDistance(wav_paths, map_file):
86
+ map_df = pd.read_csv(map_file)
87
+ #for index, row in map_df.iterrows():
88
+ # gr.Info(row['File1'].astype(str))
89
+
90
  if wav_paths is None:
91
  gr.Warning("Please upload some sound files!")
92
  return None
 
125
 
126
  # Evaluate Distance of all speaker pairs
127
  distances_list = []
128
+ #wav_pairs = list(combinations(names, 2))
129
+ wav_pairs = []
130
+
131
+ for index, row in map_df.iterrows():
132
+ file1_index = find_substring_index(names, row['S1'])
133
+ file2_index = find_substring_index(names, row['S2'])
134
+
135
+ if(file1_index != -1 and file2_index != -1):
136
+ wav_pairs.append((names[file1_index], names[file2_index]))
137
+
138
+ #print(len(wav_pairs))
139
  for wav_pair in wav_pairs:
140
  S1 = wav_pair[0]
141
  S2 = wav_pair[1]
142
+ #print("*** " + S1 + " *** " + S2 + " ***")
143
+
144
  # FULL DIMENSIONALITY
145
  distance = calc_distance(df_subset, S1, S2, hubert_feature_columns)
146
  distances_list.append([os.path.basename(S1), os.path.basename(S2), distance])
147
  return distances_list
148
 
149
def find_substring_index(string_list, substring):
    """Return the index of the entry in *string_list* matching *substring*.

    Matching is done in two tiers so that a short name cannot be hijacked by
    a longer name that merely contains it (e.g. ``"s_1"`` vs ``"s_10.wav"``):

    1. An entry whose file name equals *substring*, either with or without
       its extension, wins immediately.
    2. Otherwise the first entry that contains *substring* anywhere is used
       (the original behaviour, kept as a fallback).

    Args:
        string_list: Iterable of strings (typically file paths) to search.
        substring: Name to look for. Non-string values (such as the NaN that
            pandas produces for a blank CSV cell) and empty strings never
            match.

    Returns:
        The zero-based index of the matching entry, or ``-1`` if none matches.
    """
    # A blank cell in the map file is read as float NaN; `NaN in "text"` would
    # raise TypeError. An empty string is contained in every string and would
    # silently select the first file, so both are treated as "no match".
    if not isinstance(substring, str) or not substring:
        return -1

    first_partial = -1
    for index, string in enumerate(string_list):
        base = os.path.basename(string)
        stem = os.path.splitext(base)[0]
        if substring == stem or substring == base:
            return index
        # Remember only the earliest partial hit; keep scanning in case a
        # later entry is an exact match.
        if first_partial == -1 and substring in string:
            first_partial = index
    return first_partial
154
+
155
  #csv export function
156
  def export_csv(d):
157
  if(len(d.iloc[0,0])>0):
 
163
 
164
  #main GradIO interface
165
  with gr.Blocks() as demo:
166
+ gr.set_static_paths(paths=[Path.cwd().absolute()/"assets"])
167
  gr.Markdown(
168
  """
169
  # PS3-PDM: Perceptual Similarity Space for Speech-Pairwise Distance Matrix
 
172
  - Supported by the National Science Foundation (DRL 2219843) and Binational Science Foundation (2022618)
173
 
174
  ## Description
175
+ Takes a set of utterance files (.wav format) and a two column .csv *map file*. Generates pair-wise distances of the corresponding trajectories in HuBERT embedding spaces. Methods are based on Kim et al. (2025) and Chernyak et al. (2024). We report distances for embeddings in the original embedding space of transformer layer 12, without any form of dimensionality reduction.
176
  """)
177
 
178
  with gr.Accordion("Click for more details", open=False):
 
188
 
189
  ## Requirements
190
  - All speech files must be in .wav format. (Note: It is recommended to normalize the loudness of the files.)
191
+ - All speech files that are being compared must contain productions of the identical linguistic content (i.e., same words in same order).
192
  - For example, the files may contain productions of a given sentence by different talkers, or by a single talker under different conditions.
193
  - Note that while the utility will return distance values for files with different content the interpretation of these values is meaningless.
194
 
195
  ## Usage
196
  - Upload wav files.
197
+ - Upload csv *map file* that contains two columns with the headers "S1" and "S2".
198
+
199
+ | S1 | S2 |
200
+ | --------------- | --------------- |
201
+ | my_sentence_1_1 | my_sentence_1_2 |
202
+ | my_sentence_2_1 | my_sentence_2_2 |
203
+ | etc... | etc... |
204
+
205
+ - Example csv map file available [here](https://huggingface.co/spaces/MLSpeech/perceptual-similarity/resolve/main/example.csv)
206
+ - Each cell should contain the name of a wav file that was uploaded **without the ".wav" extension**
207
+ - Distances will be measured by comparing the files in the "S1" column to the files in the "S2" column
208
  - Click 'run' to get distances.
209
+ - Output (download in .csv format) consists of a table with 4 columns (index, S1, S2, distance)
210
 
211
  ## Capacity limits
212
  - Processing time is approximately 7 times the duration of the input audio files. For example, a minute of audio can take up to 7 minutes to process. If processing is taking longer than expected, please refresh the page and reupload your files.
 
221
  """
222
  )
223
  with gr.Row():
224
+ inputFiles = gr.File(label="wav files", file_count="multiple", file_types=[".wav"])
225
+ mapFile = gr.File(label="map file", file_count="single", file_types=[".csv", ".txt"])
226
  with gr.Column():
227
  runbtn = gr.Button("Run")
228
  csv = gr.File(interactive=False, visible=False)
229
  dataframe = gr.Dataframe(headers=["S1", "S2", "distance"], visible=True, row_count=[1, 'dynamic'])
230
 
231
 
232
+ runbtn.click(fn=grMeasureDistance, inputs=[inputFiles, mapFile], outputs=dataframe)
233
  dataframe.change(export_csv, inputs=dataframe, outputs=csv)
234
 
235
  inputFiles.change(fn=clearInterface, inputs=None, outputs=[csv, dataframe])