mohan007 committed
Commit f77db10 · verified · 1 Parent(s): 52be56c

Create app.py

Files changed (1)
  1. app.py +321 -0
app.py ADDED
@@ -0,0 +1,321 @@
+ import base64
+ import logging
+ import os
+ import shlex
+ import shutil
+ import time
+ from typing import Literal, Optional
+
+ import gradio as gr
+ import torch
+ import wget
+ # from omegaconf import OmegaConf
+ from diarization_utils import diarize
+ from huggingface_hub import HfApi
+ from pyannote.audio import Pipeline
+ from pydantic import BaseModel, ValidationError
+ from pydantic_settings import BaseSettings
+ from starlette.exceptions import HTTPException
+ from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, pipeline
+
+ # from config import model_settings, InferenceConfig
+
+ # Shared state between the Gradio callbacks
+ variable = []   # paths of the uploaded files
+ speech = ""     # full ASR transcript, reused by the summarization step
+ # context_2 = ""
+
+ logger = logging.getLogger(__name__)
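+
+ # Overall flow of this app:
+ #   1. upload_file()     - stores the uploaded audio/video paths in `variable`
+ #   2. audio_function()  - converts the upload to WAV with ffmpeg, runs Whisper ASR
+ #                          and pyannote speaker diarization
+ #   3. audio_function2() - summarizes the stored transcript with ChatGLM3-6B-32K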
+
+
+ class ModelSettings(BaseSettings):
+     asr_model: str
+     assistant_model: Optional[str]
+     diarization_model: Optional[str]
+     hf_token: Optional[str]
+
+
+ class InferenceConfig(BaseModel):
+     task: Literal["transcribe", "translate"] = "transcribe"
+     batch_size: int = 24
+     assisted: bool = False
+     chunk_length_s: int = 30
+     sampling_rate: int = 16000
+     language: Optional[str] = None
+     num_speakers: Optional[int] = None
+     min_speakers: Optional[int] = None
+     max_speakers: Optional[int] = None
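+
+ # ModelSettings is never instantiated in this file; only the InferenceConfig
+ # defaults above are used (see audio_function below).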
+
+ # from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR
+ # from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+ # logger.info(f"Using device: {device.type}")
+ torch_dtype = torch.float32 if device.type == "cpu" else torch.float16
+
+ # ChatGLM3-6B-32K is used for the summarization step
+ tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b-32k", trust_remote_code=True)
+ model = AutoModel.from_pretrained("THUDM/chatglm3-6b-32k", trust_remote_code=True, device_map="auto")
+ # base_model = "lyogavin/Anima-7B-100K"
+ # tokenizer = AutoTokenizer.from_pretrained(base_model)
+ # model = AutoModelForCausalLM.from_pretrained(
+ #     base_model,
+ #     bnb_4bit_compute_dtype=torch.float16,
+ #     # torch_dtype=torch.float16,
+ #     trust_remote_code=True,
+ #     device_map="auto",
+ #     load_in_4bit=True
+ # )
+ # model.eval()
+
+ # Optional assistant model for assisted generation (speculative decoding) with Whisper
+ assistant_model = AutoModelForCausalLM.from_pretrained(
+     "distil-whisper/distil-large-v3",
+     torch_dtype=torch_dtype,
+     low_cpu_mem_usage=True,
+     use_safetensors=True
+ )
+
+ assistant_model.to(device)
+
+ asr_pipeline = pipeline(
+     "automatic-speech-recognition",
+     model="openai/whisper-large-v3",
+     torch_dtype=torch_dtype,
+     device=device
+ )
+
+
+ HfApi().whoami(os.getenv('HF_TOKEN'))
+ diarization_pipeline = Pipeline.from_pretrained(
+     checkpoint_path="pyannote/speaker-diarization-3.1",
+     use_auth_token=os.getenv('HF_TOKEN'),
+ )
+ diarization_pipeline.to(device)
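+ # NOTE: pyannote/speaker-diarization-3.1 is a gated model, so the HF_TOKEN used above
+ # must belong to an account that has accepted the model's terms on the Hugging Face Hub.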
+
+
+ def upload_file(files):
+     # Remember the uploaded file paths so audio_function() can pick them up.
+     file_paths = [file.name for file in files]
+
+     global variable
+     variable = file_paths
+
+     return file_paths
+
+
+
+ def audio_function():
+     # Convert the uploaded file to WAV, transcribe it with Whisper, attach speaker
+     # labels with pyannote, and return the diarized transcript, the timestamped
+     # chunks and the plain ASR text.
+     global speech  # the transcript is stored for the summarization step (audio_function2)
+
+     time_1 = time.time()
+     paths = variable
+
+     # NOTE: the paths are concatenated into a single string, so this effectively
+     # assumes that exactly one file was uploaded.
+     str1 = "".join(paths)
+     print("before processing ffmpeg!")
+
+     command_to_mp4_to_wav = "ffmpeg -i {} current_out.wav -y"
+     # -acodec pcm_s16le -ar 16000 -ac 1
+     # shlex.quote() protects against spaces and shell metacharacters in the path
+     os.system(command_to_mp4_to_wav.format(shlex.quote(str1)))
+
+     print("after ffmpeg")
+
+     # os.system("insanely-fast-whisper --file-name {}_new.wav --task transcribe --hf_token <HF_TOKEN>".format(str1.replace("mp3","")))
+
+     parameters = InferenceConfig()
+
+     generate_kwargs = {
+         "task": parameters.task,
+         "language": parameters.language,
+         "assistant_model": assistant_model if parameters.assisted else None
+     }
+
+     asr_outputs = asr_pipeline(
+         "current_out.wav",
+         chunk_length_s=parameters.chunk_length_s,
+         batch_size=parameters.batch_size,
+         generate_kwargs=generate_kwargs,
+         return_timestamps=True,
+     )
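+     # asr_outputs contains the full "text" plus timestamped "chunks"
+     # (because return_timestamps=True); both are shown in the UI.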
+
+     transcript = diarize(diarization_pipeline, "current_out.wav", parameters, asr_outputs)
+
+     # Keep the full transcript around for the "Summarize" button (audio_function2).
+     speech = asr_outputs["text"]
+
+     return transcript, asr_outputs["chunks"], asr_outputs["text"]
+     # Unreachable duplicate of the return above, kept for reference:
+     # return {
+     #     "speakers": transcript,
+     #     "chunks": asr_outputs["chunks"],
+     #     "text": asr_outputs["text"],
+     # }
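+
+     # -----------------------------------------------------------------------
+     # Everything below this return is unreachable: it is an earlier
+     # implementation based on NVIDIA NeMo (neural VAD, Conformer CTC ASR and
+     # TitaNet speaker embeddings). Re-enabling it requires the commented-out
+     # OmegaConf and NeMo imports at the top of the file.
+     # -----------------------------------------------------------------------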
+     a = time.time()
+     DOMAIN_TYPE = "meeting"  # can be "meeting" or "telephonic", depending on the domain of the audio file
+     CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"
+
+     CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"
+
+     CONFIG = wget.download(CONFIG_URL, "./")
+     cfg = OmegaConf.load(CONFIG)
+     # print(OmegaConf.to_yaml(cfg))
+
+     # Create a manifest file for the input with the format below.
+     # {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-",
+     #  "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath": "/path/to/uem/filepath"}
+     import json
+     meta = {
+         'audio_filepath': "current_out.wav",
+         'offset': 0,
+         'duration': None,
+         'label': 'infer',
+         'text': '-',
+         'num_speakers': None,
+         'rttm_filepath': None,
+         'uem_filepath': None
+     }
+     with open(os.path.join('input_manifest.json'), 'w') as fp:
+         json.dump(meta, fp)
+         fp.write('\n')
+
+     cfg.diarizer.manifest_filepath = 'input_manifest.json'
+     cfg.diarizer.out_dir = "./"  # directory to store intermediate files and prediction outputs
+     pretrained_speaker_model = 'titanet_large'
+     cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
+     cfg.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5, 1.25, 1.0, 0.75, 0.5]
+     cfg.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75, 0.625, 0.5, 0.375, 0.1]
+     cfg.diarizer.speaker_embeddings.parameters.multiscale_weights = [1, 1, 1, 1, 1]
+     cfg.diarizer.oracle_vad = True  # ----> oracle VAD
+     cfg.diarizer.clustering.parameters.oracle_num_speakers = False
+     # cfg.diarizer.manifest_filepath = 'input_manifest.json'
+     # # !cat {cfg.diarizer.manifest_filepath}
+     # pretrained_speaker_model = 'titanet_large'
+     # cfg.diarizer.manifest_filepath = cfg.diarizer.manifest_filepath
+     # cfg.diarizer.out_dir = "./"  # directory to store intermediate files and prediction outputs
+     # cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
+     # cfg.diarizer.clustering.parameters.oracle_num_speakers = False
+
+     # Using neural VAD and Conformer ASR
+     cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'
+     cfg.diarizer.asr.model_path = 'stt_en_conformer_ctc_large'
+     cfg.diarizer.oracle_vad = False  # ----> not using oracle VAD
+     cfg.diarizer.asr.parameters.asr_based_vad = False
+
+     asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
+     asr_model = asr_decoder_ts.set_asr_model()
+     print(asr_model)
+     word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)
+
+     print("Decoded word output dictionary: \n", word_hyp)
+     print("Word-level timestamps dictionary: \n", word_ts_hyp)
+
+     asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
+     asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset
+
+     diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)
+     print("Diarization hypothesis output: \n", diar_hyp)
+     trans_info_dict = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)
+     # print(trans_info_dict)
+
+     # with open(os.path.join('output_diarization.json'), 'w') as fp1:
+     #     json.dump(trans_info_dict, fp1)
+     #     fp1.write('\n')
+     # b = time.time()
+     # print(b - a, "seconds diarization time for 50 min audio")
+
+     import json
+     context = ""
+     context_2 = ""
+     # global context_2
+     # with open("output.json","r") as fli:
+     #     json_dict = json.load(fli)
+     # for lst in sorted(json_dict["speakers"], key=lambda x: x['timestamp'][0], reverse=False):
+     #     context = context + str(lst["timestamp"][0]) + " : " + str(lst["timestamp"][1]) + " = " + lst["text"] + "\n"
+     #     context = context + str(lst["timestamp"][0]) + " : " + str(lst["timestamp"][1]) + " = " + lst["speaker"] + " ; " + lst["text"] + "\n"
+     for dct in trans_info_dict["current_out"]["sentences"]:
+         # context = context + "start_time : {} ".format(dct["start_time"]) + "end_time : {} ".format(dct["end_time"]) + "speaker : {} ".format(dct["speaker"]) + "\n"
+         context = context + str(dct["start_time"]) + " : " + str(dct["end_time"]) + " = " + dct["speaker"] + " ; " + dct["text"] + "\n"
+         context_2 = context_2 + str(dct["start_time"]) + " : " + str(dct["end_time"]) + " = " + dct["text"] + "\n"
+     # `global speech` is declared at the top of audio_function()
+     speech = trans_info_dict["current_out"]["transcription"]
+
+     time_2 = time.time()
+
+     return context, context_2, str(int(time_2 - time_1)) + " seconds"
+
+ def audio_function2():
+     # Summarize the transcript produced by audio_function() with ChatGLM3.
+
+     # global speech  (read-only access, no declaration needed)
+     str2 = speech
+     time_3 = time.time()
+
+     # prompt = "{} generate medical subjective objective assessment plan (SOAP) notes?".format(str2)
+     prompt = "{} Summarize this sales call. Did the agent qualify the lead properly?".format(str2)
+
+     # model = model.eval()
+     response, history = model.chat(tokenizer, prompt, history=[])
+     print(response)
+     # del model
+     # del tokenizer
+     # torch.cuda.empty_cache()
+     time_4 = time.time()
+     # response, history = model.chat(tokenizer, "What should I do if I cannot sleep at night?", history=history)
+     # print(response)
+
+     # inputs = tokenizer(prompt, return_tensors="pt")
+
+     # inputs['input_ids'] = inputs['input_ids'].cuda()
+     # inputs['attention_mask'] = inputs['attention_mask'].cuda()
+
+     # generate_ids = model.generate(**inputs, max_new_tokens=4096,
+     #                               only_last_logit=True,  # to save memory
+     #                               use_cache=False,       # enable this to save memory when running into OOM
+     #                               xentropy=True)
+     # output = tokenizer.batch_decode(generate_ids,
+     #                                 skip_special_tokens=True,
+     #                                 clean_up_tokenization_spaces=False)
+
+     # tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
+     # model = AutoModelForCausalLM.from_pretrained("togethercomputer/LLaMA-2-7B-32K", trust_remote_code=True, torch_dtype=torch.float16, device_map="auto", bnb_4bit_compute_dtype=torch.float16, load_in_4bit=True)
+
+     # input_context = "summarize " + " the following {}".format(str2)
+     # input_ids = tokenizer.encode(input_context, return_tensors="pt").cuda()
+     # output = model.generate(input_ids, max_new_tokens=512, temperature=0.7)
+     # output_text = tokenizer.decode(output[0], skip_special_tokens=True)
+     # print(output_text, "wow what happened")
+     # return output
+     return response, str(int(time_4 - time_3)) + " seconds"
+
+
+ with gr.Blocks() as demo:
+     file_output = gr.File()
+     upload_button = gr.UploadButton("Click to Upload a File", file_types=["audio", "video"], file_count="multiple")
+     upload_button.upload(upload_file, upload_button, file_output)
+
+     gr.Markdown("## Click Process Audio to display the text from the audio file")
+     process_button = gr.Button("Process Audio")
+     diarization_text = gr.Textbox(label="Speech Diarization")
+     chunks_text = gr.Textbox(label="Speech chunks")
+     asr_text = gr.Textbox(label="ASR text")
+     process_button.click(audio_function, outputs=[diarization_text, chunks_text, asr_text])
+
+     gr.Markdown("## Click Summarize to display the call summary")
+     summarize_button = gr.Button("Summarize")
+     summary_text = gr.Textbox(label="Call Summary")
+     time_text = gr.Textbox(label="Time Taken")
+     summarize_button.click(audio_function2, outputs=[summary_text, time_text])
+
+ # The login credentials were hardcoded; read them from the environment instead
+ # (APP_USERNAME / APP_PASSWORD are names chosen here, defaulting to the original values).
+ demo.launch(
+     server_name="0.0.0.0",
+     auth=(os.getenv("APP_USERNAME", "manish"), os.getenv("APP_PASSWORD", "openrainbow")),
+     auth_message="Enter your credentials",
+ )