import base64
import json
import logging
import os
import shutil
import time

import gradio as gr
import torch
from huggingface_hub import HfApi
from pyannote.audio import Pipeline
from pydantic import BaseModel, ValidationError
from pydantic_settings import BaseSettings
from starlette.exceptions import HTTPException
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, pipeline
from typing import Literal, Optional

from diarization_utils import diarize

# from omegaconf import OmegaConf
# import wget
# from config import model_settings, InferenceConfig

# Shared state between the Gradio callbacks.
variable = []  # paths of the uploaded files
speech = ""    # full transcription of the last processed file, consumed by the summarizer
# context_2 = ""

logger = logging.getLogger(__name__)
class ModelSettings(BaseSettings):
    asr_model: str
    assistant_model: Optional[str]
    diarization_model: Optional[str]
    hf_token: Optional[str]


class InferenceConfig(BaseModel):
    task: Literal["transcribe", "translate"] = "transcribe"
    batch_size: int = 24
    assisted: bool = False
    chunk_length_s: int = 30
    sampling_rate: int = 16000
    language: Optional[str] = None
    num_speakers: Optional[int] = None
    min_speakers: Optional[int] = None
    max_speakers: Optional[int] = None
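
# InferenceConfig groups the ASR / diarization parameters; audio_function() below simply
# instantiates it with these defaults (no per-request overrides are exposed in the UI).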
# from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR
# from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# logger.info(f"Using device: {device.type}")
torch_dtype = torch.float32 if device.type == "cpu" else torch.float16
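
# Summarization LLM: ChatGLM3-6B-32K (32k context) loaded with trust_remote_code so its
# custom chat() method is available; it is used by audio_function2() to summarize the call.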
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b-32k", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm3-6b-32k", trust_remote_code=True, device_map="auto")

# base_model = "lyogavin/Anima-7B-100K"
# tokenizer = AutoTokenizer.from_pretrained(base_model)
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     bnb_4bit_compute_dtype=torch.float16,
#     # torch_dtype=torch.float16,
#     trust_remote_code=True,
#     device_map="auto",
#     load_in_4bit=True
# )
# model.eval()
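
# ASR: Whisper large-v3 pipeline, with distil-large-v3 as an optional assistant model for
# assisted (speculative) decoding when InferenceConfig.assisted is True.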
assistant_model = AutoModelForCausalLM.from_pretrained(
    "distil-whisper/distil-large-v3",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
assistant_model.to(device)

asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=torch_dtype,
    device=device,
)
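
# Speaker diarization: pyannote/speaker-diarization-3.1 is a gated model, so a valid
# HF_TOKEN with accepted terms is required; whoami() fails fast if the token is invalid.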
HfApi().whoami(os.getenv("HF_TOKEN"))

diarization_pipeline = Pipeline.from_pretrained(
    checkpoint_path="pyannote/speaker-diarization-3.1",
    use_auth_token=os.getenv("HF_TOKEN"),
)
diarization_pipeline.to(device)
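
# --- Gradio callbacks ---
# upload_file() stores the uploaded file paths in the module-level `variable`
# so that audio_function() can pick them up when "Process Audio" is clicked.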
def upload_file(files):
    global variable
    file_paths = [file.name for file in files]
    variable = file_paths
    return file_paths
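
# audio_function(): convert the uploaded file to WAV with ffmpeg, transcribe it with the
# Whisper pipeline, diarize it with pyannote, and return the speaker-labelled transcript,
# the timestamped chunks, and the raw transcription text.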
def audio_function():
    time_1 = time.time()

    # Only a single upload is effectively supported: the stored paths are joined as-is.
    paths = variable
    input_path = "".join(paths)

    print("before processing ffmpeg !")
    command_to_mp4_to_wav = 'ffmpeg -i "{}" current_out.wav -y'
    # -acodec pcm_s16le -ar 16000 -ac 1
    os.system(command_to_mp4_to_wav.format(input_path))
    print("after ffmpeg")
    # os.system("insanely-fast-whisper --file-name {}_new.wav --task transcribe --hf_token $HF_TOKEN".format(input_path.replace("mp3", "")))

    parameters = InferenceConfig()
    generate_kwargs = {
        "task": parameters.task,
        "language": parameters.language,
        "assistant_model": assistant_model if parameters.assisted else None,
    }

    asr_outputs = asr_pipeline(
        "current_out.wav",
        chunk_length_s=parameters.chunk_length_s,
        batch_size=parameters.batch_size,
        generate_kwargs=generate_kwargs,
        return_timestamps=True,
    )

    transcript = diarize(diarization_pipeline, "current_out.wav", parameters, asr_outputs)

    # Keep the full transcription around for the "Summarize" button.
    global speech
    speech = asr_outputs["text"]

    return transcript, asr_outputs["chunks"], asr_outputs["text"]
    # ------------------------------------------------------------------
    # Unreachable alternative: NeMo-based ASR + diarization pipeline.
    # The code below sat after the return above and was never executed; it also depends on
    # wget, OmegaConf and the NeMo imports that are commented out at the top of this file,
    # so it is kept here commented out for reference.
    # ------------------------------------------------------------------
    # return {
    #     "speakers": transcript,
    #     "chunks": asr_outputs["chunks"],
    #     "text": asr_outputs["text"],
    # }
    #
    # a = time.time()
    # DOMAIN_TYPE = "meeting"  # Can be meeting or telephonic based on domain type of the audio file
    # CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"
    # CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"
    # CONFIG = wget.download(CONFIG_URL, "./")
    # cfg = OmegaConf.load(CONFIG)
    # # print(OmegaConf.to_yaml(cfg))
    #
    # # Create a manifest file for input with the below format.
    # # {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-",
    # #  "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath": "/path/to/uem/filepath"}
    # import json
    # meta = {
    #     'audio_filepath': "current_out.wav",
    #     'offset': 0,
    #     'duration': None,
    #     'label': 'infer',
    #     'text': '-',
    #     'num_speakers': None,
    #     'rttm_filepath': None,
    #     'uem_filepath': None
    # }
    # with open(os.path.join('input_manifest.json'), 'w') as fp:
    #     json.dump(meta, fp)
    #     fp.write('\n')
    #
    # cfg.diarizer.manifest_filepath = 'input_manifest.json'
    # cfg.diarizer.out_dir = "./"  # Directory to store intermediate files and prediction outputs
    # pretrained_speaker_model = 'titanet_large'
    # cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
    # cfg.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5, 1.25, 1.0, 0.75, 0.5]
    # cfg.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75, 0.625, 0.5, 0.375, 0.1]
    # cfg.diarizer.speaker_embeddings.parameters.multiscale_weights = [1, 1, 1, 1, 1]
    # cfg.diarizer.oracle_vad = True  # ----> ORACLE VAD
    # cfg.diarizer.clustering.parameters.oracle_num_speakers = False
    #
    # # Using Neural VAD and Conformer ASR
    # cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'
    # cfg.diarizer.asr.model_path = 'stt_en_conformer_ctc_large'
    # cfg.diarizer.oracle_vad = False  # ----> Not using oracle VAD
    # cfg.diarizer.asr.parameters.asr_based_vad = False
    #
    # asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
    # asr_model = asr_decoder_ts.set_asr_model()
    # print(asr_model)
    # word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)
    # print("Decoded word output dictionary: \n", word_hyp)
    # print("Word-level timestamps dictionary: \n", word_ts_hyp)
    #
    # asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
    # asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset
    # diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)
    # print("Diarization hypothesis output: \n", diar_hyp)
    # trans_info_dict = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)
    # # print(trans_info_dict)
    # # with open(os.path.join('output_diarization.json'), 'w') as fp1:
    # #     json.dump(trans_info_dict, fp1)
    # #     fp1.write('\n')
    # # b = time.time()
    # # print(b - a, "seconds diarization time for 50 min audio")
    #
    # context = ""
    # context_2 = ""
    # # global context_2
    # # with open("output.json", "r") as fli:
    # #     json_dict = json.load(fli)
    # # for lst in sorted(json_dict["speakers"], key=lambda x: x['timestamp'][0], reverse=False):
    # #     context = context + str(lst["timestamp"][0]) + " : " + str(lst["timestamp"][1]) + " = " + lst["text"] + "\n"
    # #     context = context + str(lst["timestamp"][0]) + " : " + str(lst["timestamp"][1]) + " = " + lst["speaker"] + " ; " + lst["text"] + "\n"
    # for dct in trans_info_dict["current_out"]["sentences"]:
    #     # context = context + "start_time : {} ".format(dct["start_time"]) + "end_time : {} ".format(dct["end_time"]) + "speaker : {} ".format(dct["speaker"]) + "\n"
    #     context = context + str(dct["start_time"]) + " : " + str(dct["end_time"]) + " = " + dct["speaker"] + " ; " + dct["text"] + "\n"
    #     context_2 = context_2 + str(dct["start_time"]) + " : " + str(dct["end_time"]) + " = " + dct["text"] + "\n"
    # global speech
    # speech = trans_info_dict["current_out"]["transcription"]
    # time_2 = time.time()
    # return context, context_2, str(int(time_2 - time_1)) + " seconds"
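
# audio_function2(): send the stored transcription to ChatGLM3 and return its summary of
# the sales call together with the time the generation took.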
def audio_function2():
    # global speech
    str2 = speech
    time_3 = time.time()

    # prompt = " {} generate medical subjective objective assessment plan (soap) notes ?".format(str2)
    prompt = " {} summary of sales call ? did the agent qualify the lead properly ?".format(str2)

    # model = model.eval()
    response, history = model.chat(tokenizer, prompt, history=[])
    print(response)
    # del model
    # del tokenizer
    # torch.cuda.empty_cache()
    time_4 = time.time()

    # response, history = model.chat(tokenizer, "What should I do if I can't sleep at night?", history=history)
    # print(response)
    # inputs = tokenizer(prompt, return_tensors="pt")
    # inputs['input_ids'] = inputs['input_ids'].cuda()
    # inputs['attention_mask'] = inputs['attention_mask'].cuda()
    # generate_ids = model.generate(**inputs, max_new_tokens=4096,
    #                               only_last_logit=True,  # to save memory
    #                               use_cache=False,  # when run into OOM, enabling this can save memory
    #                               xentropy=True)
    # output = tokenizer.batch_decode(generate_ids,
    #                                 skip_special_tokens=True,
    #                                 clean_up_tokenization_spaces=False)
    # tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
    # model = AutoModelForCausalLM.from_pretrained("togethercomputer/LLaMA-2-7B-32K", trust_remote_code=True, torch_dtype=torch.float16, device_map="auto", bnb_4bit_compute_dtype=torch.float16, load_in_4bit=True)
    # input_context = "summarize " + " the following {}".format(str2)
    # input_ids = tokenizer.encode(input_context, return_tensors="pt").cuda()
    # output = model.generate(input_ids, max_new_tokens=512, temperature=0.7)
    # output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # print(output_text, "wow what happened ")
    # return output

    return response, str(int(time_4 - time_3)) + " seconds"
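
# --- Gradio UI ---
# File upload -> "Process Audio" (Whisper ASR + pyannote diarization) -> "Summarize" (ChatGLM3).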
with gr.Blocks() as demo:
    file_output = gr.File()
    upload_button = gr.UploadButton("Click to Upload a File", file_types=["audio", "video"], file_count="multiple")
    upload_button.upload(upload_file, upload_button, file_output)

    gr.Markdown("## Click Process Audio to display the text from the audio file")
    submit_button = gr.Button("Process Audio")
    output_text = gr.Textbox(label="Speech Diarization")
    output_text_2 = gr.Textbox(label="Speech Chunks")
    submit_button.click(audio_function, outputs=[output_text, output_text_2, gr.Textbox(label="ASR Text")])

    gr.Markdown("## Click Summarize to display the call summary")
    summarize_button = gr.Button("Summarize")
    summary_text = gr.Textbox(label="Call Summary")
    summarize_button.click(audio_function2, outputs=[summary_text, gr.Textbox(label="Time Taken")])

demo.launch(server_name="0.0.0.0", auth=("manish", "openrainbow"), auth_message="Enter your credentials")