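# Streamlit app: record audio in the browser, transcribe it with Whisper, then
# extract lead-related entities from the transcript with GLiNER.
# Assumed entry point -- if this file is saved as app.py, run it with: streamlit run app.py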
import streamlit as st
from st_audiorec import st_audiorec
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
#from datasets import load_dataset
import torch
from gliner import GLiNER
from resources import Lead_Labels, entity_labels, set_start, audit_elapsedtime
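# Note: `resources` is a local module (not shown here); `set_start` and
# `audit_elapsedtime` are assumed to be simple timing helpers that capture a
# start timestamp and log the elapsed time for the named step.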

def main():
    """Render the recorder UI, transcribe the recording, and extract entities."""
    print("------------------------------")
    print("Running main")
    rec = init_model_trans()
    ner = init_model_ner()  # async
    labels = entity_labels
    # text = "I have a proposal from cgd where they want one outsystems junior developers and one senior for an estimate of three hundred euros a day, for six months."
    # print(f"get entities from sample text: {text}")
    # get_entity_labels(model=ner, text=text, labels=labels)

    print("Rendering UI...")
    start_render = set_start()
    wav_audio_data = st_audiorec()
    audit_elapsedtime(function="Rendering UI", start=start_render)

    if wav_audio_data is not None and rec is not None:
        print("Loading data...")
        start_loading = set_start()
        st.audio(wav_audio_data, format='audio/wav')
        text = transcribe(wav_audio_data, rec)

        if text is not None:
            get_entity_labels(labels=labels, model=ner, text=text)
        audit_elapsedtime(function="Loading data", start=start_loading)

def init_model_trans():
    """Load the Whisper speech-to-text model and wrap it in a transformers pipeline."""
    print("Initiating transcription model...")
    start = set_start()

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )
    print('Init model successful')
    audit_elapsedtime(function="Initiating transcription model", start=start)
    return pipe

def init_model_ner():
    """Load the GLiNER multilingual named-entity-recognition model."""
    print("Initiating NER model...")
    start = set_start()
    model = GLiNER.from_pretrained("urchade/gliner_multi")
    audit_elapsedtime(function="Initiating NER model", start=start)
    return model

def transcribe(audio_sample: bytes, pipe) -> str:
    """Run the ASR pipeline on the recorded audio and show the transcript in the UI."""
    print("Initiating transcription...")
    start = set_start()
    # dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
    # sample = dataset[0]["audio"]

    result = pipe(audio_sample)
    audit_elapsedtime(function="Transcription", start=start)

    print(result)
    st.write('Transcription: ', result["text"])
    return result["text"]

def get_entity_labels(model: GLiNER, text: str, labels: list):  # -> Lead_Labels:
    """Extract the requested entity labels from the transcript and show them in the UI."""
    print("Initiating entity recognition...")
    start = set_start()
    entities = model.predict_entities(text, labels)
    audit_elapsedtime(function="Retrieving entity labels from text", start=start)

    for entity in entities:
        print(entity["text"], "=>", entity["label"])
    st.write('Entities: ', entities)
    # return Lead_Labels()

if __name__ == "__main__":
    print("IN __name__")
    main()