mohan007 committed
Commit aeccecc · verified · 1 Parent(s): c4bdf6c

Update app.py

Files changed (1)
  1. app.py +0 -102
app.py CHANGED
@@ -155,109 +155,7 @@ def audio_function():
     global speech
     speech = transcript
     return transcript,asr_outputs["chunks"],asr_outputs["text"]
-    return {
-        "speakers": transcript,
-        "chunks": asr_outputs["chunks"],
-        "text": asr_outputs["text"],
-    }
-    a=time.time()
-    DOMAIN_TYPE = "meeting" # Can be meeting or telephonic based on domain type of the audio file
-    CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"
-
-    CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"
-
-
-    CONFIG = wget.download(CONFIG_URL,"./")
-    cfg = OmegaConf.load(CONFIG)
-    # print(OmegaConf.to_yaml(cfg))
-
-
-    # Create a manifest file for input with below format.
-    # {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-",
-    # "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath"="/path/to/uem/filepath"}
-    import json
-    meta = {
-        'audio_filepath': "current_out.wav",
-        'offset': 0,
-        'duration':None,
-        'label': 'infer',
-        'text': '-',
-        'num_speakers': None,
-        'rttm_filepath': None,
-        'uem_filepath' : None
-    }
-    with open(os.path.join('input_manifest.json'),'w') as fp:
-        json.dump(meta,fp)
-        fp.write('\n')
-
-    cfg.diarizer.manifest_filepath = 'input_manifest.json'
-    cfg.diarizer.out_dir = "./" # Directory to store intermediate files and prediction outputs
-    pretrained_speaker_model = 'titanet_large'
-    cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
-    cfg.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5,1.25,1.0,0.75,0.5]
-    cfg.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75,0.625,0.5,0.375,0.1]
-    cfg.diarizer.speaker_embeddings.parameters.multiscale_weights= [1,1,1,1,1]
-    cfg.diarizer.oracle_vad = True # ----> ORACLE VAD
-    cfg.diarizer.clustering.parameters.oracle_num_speakers = False
-    # cfg.diarizer.manifest_filepath = 'input_manifest.json'
-    # # !cat {cfg.diarizer.manifest_filepath}
-    # pretrained_speaker_model='titanet_large'
-    # cfg.diarizer.manifest_filepath = cfg.diarizer.manifest_filepath
-    # cfg.diarizer.out_dir = "./" #Directory to store intermediate files and prediction outputs
-    # cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
-    # cfg.diarizer.clustering.parameters.oracle_num_speakers=False
-
-    # Using Neural VAD and Conformer ASR
-    cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'
-    cfg.diarizer.asr.model_path = 'stt_en_conformer_ctc_large'
-    cfg.diarizer.oracle_vad = False # ----> Not using oracle VAD
-    cfg.diarizer.asr.parameters.asr_based_vad = False
-
-
-    asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
-    asr_model = asr_decoder_ts.set_asr_model()
-    print(asr_model)
-    word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)
-
-    print("Decoded word output dictionary: \n", word_hyp)
-    print("Word-level timestamps dictionary: \n", word_ts_hyp)
-
 
-    asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
-    asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset
-
-    diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)
-    print("Diarization hypothesis output: \n", diar_hyp)
-    trans_info_dict = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)
-    # print(trans_info_dict)
-
-    # with open(os.path.join('output_diarization.json'),'w') as fp1:
-    # json.dump(trans_info_dict,fp1)
-    # fp1.write('\n')
-    # b = time.time()
-    # print(b-a,"seconds diartization time for 50 min audio")
-
-
-    import json
-    context = ""
-    context_2 = ""
-    # global context_2
-    # with open("output.json","r") as fli:
-    # json_dict = json.load(fli)
-    # for lst in sorted(json_dict["speakers"], key=lambda x: x['timestamp'][0], reverse=False):
-    # context = context + str(lst["timestamp"][0])+" : "+str(lst["timestamp"][1]) + " = " + lst["text"]+"\n"
-    # context = context + str(lst["timestamp"][0])+" : "+str(lst["timestamp"][1]) + " = " + lst["speaker"]+" ; "+ lst["text"]+"\n"
-    for dct in trans_info_dict["current_out"]["sentences"]:
-        # context = context + "start_time : {} ".format(dct["start_time"]) + "end_time : {} ".format(dct["end_time"])+ "speaker : {} ".format(dct["speaker"]) + "\n"
-        context = context + str(dct["start_time"])+" : "+str(dct["end_time"]) + " = " + dct["speaker"]+" ; "+ dct["text"]+"\n"
-        context_2 = context_2 + str(dct["start_time"])+" : "+str(dct["end_time"]) + " = "+ dct["text"]+"\n"
-    global speech
-    speech = trans_info_dict["current_out"]["transcription"]
-
-    time_2 = time.time()
-
-    return context,context_2,str(int(time_2-time_1)) + " seconds"
-
 def audio_function2():
     # Call the function and return its result to be displayed
 
 
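For reference, the removed block wired up NeMo's offline speaker diarization with ASR: download an inference config, write a one-entry manifest, run ASR with word-level timestamps, then cluster speakers and merge the two results. Below is a condensed, hedged sketch of that pipeline. The import paths and the diarize_with_asr wrapper are assumptions (the hunk does not show app.py's imports); the config URL, manifest fields, and model names are taken from the removed lines.

# Condensed sketch of the NeMo diarization-with-ASR pass from the removed block.
# Assumption: these import paths follow NeMo's speaker-diarization utilities and
# may need adjusting for the NeMo version app.py actually uses.
import json
import wget
from omegaconf import OmegaConf
from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps
from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR

def diarize_with_asr(audio_path="current_out.wav", domain="meeting"):
    # Download the inference config for the chosen domain (meeting or telephonic).
    config_url = ("https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/"
                  f"speaker_tasks/diarization/conf/inference/diar_infer_{domain}.yaml")
    cfg = OmegaConf.load(wget.download(config_url, "./"))

    # One-entry manifest describing the audio to diarize.
    meta = {"audio_filepath": audio_path, "offset": 0, "duration": None,
            "label": "infer", "text": "-", "num_speakers": None,
            "rttm_filepath": None, "uem_filepath": None}
    with open("input_manifest.json", "w") as fp:
        json.dump(meta, fp)
        fp.write("\n")

    cfg.diarizer.manifest_filepath = "input_manifest.json"
    cfg.diarizer.out_dir = "./"
    cfg.diarizer.speaker_embeddings.model_path = "titanet_large"
    cfg.diarizer.vad.model_path = "vad_multilingual_marblenet"
    cfg.diarizer.asr.model_path = "stt_en_conformer_ctc_large"
    cfg.diarizer.oracle_vad = False                # neural VAD, not oracle VAD
    cfg.diarizer.asr.parameters.asr_based_vad = False

    # ASR with word-level timestamps, then speaker diarization on top of it.
    asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
    asr_model = asr_decoder_ts.set_asr_model()
    word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)

    asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
    asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset
    diar_hyp, _ = asr_diar_offline.run_diarization(cfg, word_ts_hyp)

    # Speaker-labelled sentences plus the full transcription, keyed by file stem.
    return asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)

Called as trans_info_dict = diarize_with_asr(), the result is keyed by the audio file stem (e.g. "current_out") and exposes "sentences" and "transcription" entries, which is how the removed code built its timestamped, speaker-labelled context strings.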