Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -155,109 +155,7 @@ def audio_function():
|
|
155 |
global speech
|
156 |
speech = transcript
|
157 |
return transcript,asr_outputs["chunks"],asr_outputs["text"]
|
158 |
-
return {
|
159 |
-
"speakers": transcript,
|
160 |
-
"chunks": asr_outputs["chunks"],
|
161 |
-
"text": asr_outputs["text"],
|
162 |
-
}
|
163 |
-
a=time.time()
|
164 |
-
DOMAIN_TYPE = "meeting" # Can be meeting or telephonic based on domain type of the audio file
|
165 |
-
CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"
|
166 |
-
|
167 |
-
CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"
|
168 |
-
|
169 |
-
|
170 |
-
CONFIG = wget.download(CONFIG_URL,"./")
|
171 |
-
cfg = OmegaConf.load(CONFIG)
|
172 |
-
# print(OmegaConf.to_yaml(cfg))
|
173 |
-
|
174 |
-
|
175 |
-
# Create a manifest file for input with below format.
|
176 |
-
# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-",
|
177 |
-
# "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath"="/path/to/uem/filepath"}
|
178 |
-
import json
|
179 |
-
meta = {
|
180 |
-
'audio_filepath': "current_out.wav",
|
181 |
-
'offset': 0,
|
182 |
-
'duration':None,
|
183 |
-
'label': 'infer',
|
184 |
-
'text': '-',
|
185 |
-
'num_speakers': None,
|
186 |
-
'rttm_filepath': None,
|
187 |
-
'uem_filepath' : None
|
188 |
-
}
|
189 |
-
with open(os.path.join('input_manifest.json'),'w') as fp:
|
190 |
-
json.dump(meta,fp)
|
191 |
-
fp.write('\n')
|
192 |
-
|
193 |
-
cfg.diarizer.manifest_filepath = 'input_manifest.json'
|
194 |
-
cfg.diarizer.out_dir = "./" # Directory to store intermediate files and prediction outputs
|
195 |
-
pretrained_speaker_model = 'titanet_large'
|
196 |
-
cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
|
197 |
-
cfg.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5,1.25,1.0,0.75,0.5]
|
198 |
-
cfg.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75,0.625,0.5,0.375,0.1]
|
199 |
-
cfg.diarizer.speaker_embeddings.parameters.multiscale_weights= [1,1,1,1,1]
|
200 |
-
cfg.diarizer.oracle_vad = True # ----> ORACLE VAD
|
201 |
-
cfg.diarizer.clustering.parameters.oracle_num_speakers = False
|
202 |
-
# cfg.diarizer.manifest_filepath = 'input_manifest.json'
|
203 |
-
# # !cat {cfg.diarizer.manifest_filepath}
|
204 |
-
# pretrained_speaker_model='titanet_large'
|
205 |
-
# cfg.diarizer.manifest_filepath = cfg.diarizer.manifest_filepath
|
206 |
-
# cfg.diarizer.out_dir = "./" #Directory to store intermediate files and prediction outputs
|
207 |
-
# cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
|
208 |
-
# cfg.diarizer.clustering.parameters.oracle_num_speakers=False
|
209 |
-
|
210 |
-
# Using Neural VAD and Conformer ASR
|
211 |
-
cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'
|
212 |
-
cfg.diarizer.asr.model_path = 'stt_en_conformer_ctc_large'
|
213 |
-
cfg.diarizer.oracle_vad = False # ----> Not using oracle VAD
|
214 |
-
cfg.diarizer.asr.parameters.asr_based_vad = False
|
215 |
-
|
216 |
-
|
217 |
-
asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
|
218 |
-
asr_model = asr_decoder_ts.set_asr_model()
|
219 |
-
print(asr_model)
|
220 |
-
word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)
|
221 |
-
|
222 |
-
print("Decoded word output dictionary: \n", word_hyp)
|
223 |
-
print("Word-level timestamps dictionary: \n", word_ts_hyp)
|
224 |
-
|
225 |
|
226 |
-
asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
|
227 |
-
asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset
|
228 |
-
|
229 |
-
diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)
|
230 |
-
print("Diarization hypothesis output: \n", diar_hyp)
|
231 |
-
trans_info_dict = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)
|
232 |
-
# print(trans_info_dict)
|
233 |
-
|
234 |
-
# with open(os.path.join('output_diarization.json'),'w') as fp1:
|
235 |
-
# json.dump(trans_info_dict,fp1)
|
236 |
-
# fp1.write('\n')
|
237 |
-
# b = time.time()
|
238 |
-
# print(b-a,"seconds diartization time for 50 min audio")
|
239 |
-
|
240 |
-
|
241 |
-
import json
|
242 |
-
context = ""
|
243 |
-
context_2 = ""
|
244 |
-
# global context_2
|
245 |
-
# with open("output.json","r") as fli:
|
246 |
-
# json_dict = json.load(fli)
|
247 |
-
# for lst in sorted(json_dict["speakers"], key=lambda x: x['timestamp'][0], reverse=False):
|
248 |
-
# context = context + str(lst["timestamp"][0])+" : "+str(lst["timestamp"][1]) + " = " + lst["text"]+"\n"
|
249 |
-
# context = context + str(lst["timestamp"][0])+" : "+str(lst["timestamp"][1]) + " = " + lst["speaker"]+" ; "+ lst["text"]+"\n"
|
250 |
-
for dct in trans_info_dict["current_out"]["sentences"]:
|
251 |
-
# context = context + "start_time : {} ".format(dct["start_time"]) + "end_time : {} ".format(dct["end_time"])+ "speaker : {} ".format(dct["speaker"]) + "\n"
|
252 |
-
context = context + str(dct["start_time"])+" : "+str(dct["end_time"]) + " = " + dct["speaker"]+" ; "+ dct["text"]+"\n"
|
253 |
-
context_2 = context_2 + str(dct["start_time"])+" : "+str(dct["end_time"]) + " = "+ dct["text"]+"\n"
|
254 |
-
global speech
|
255 |
-
speech = trans_info_dict["current_out"]["transcription"]
|
256 |
-
|
257 |
-
time_2 = time.time()
|
258 |
-
|
259 |
-
return context,context_2,str(int(time_2-time_1)) + " seconds"
|
260 |
-
|
261 |
def audio_function2():
|
262 |
# Call the function and return its result to be displayed
|
263 |
|
|
|
155 |
global speech
|
156 |
speech = transcript
|
157 |
return transcript,asr_outputs["chunks"],asr_outputs["text"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
def audio_function2():
|
160 |
# Call the function and return its result to be displayed
|
161 |
|