Spaces:

zeimoto
/

voiceoperation

Runtime error

File size: 3,299 Bytes

1726dba
065ecac
1726dba
d130ccc
2906c35
d130ccc
2906c35
d130ccc
2906c35
5cc4f06
bcfb814
 
7c463a6
bcfb814
5cc4f06
bcfb814
7c463a6
 
2906c35
bcfb814
 
 
 
 
 
 
2906c35
 
 
 
 
8222fa8
147a6a8
5cc4f06
 
2906c35
 
5cc4f06
2906c35
 
 
5cc4f06
8222fa8
2906c35
 
 
 
5cc4f06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147a6a8
8b14f83
147a6a8
2906c35
eca6624
2906c35
 
 
8b14f83
2906c35
 
147a6a8
8b14f83
5cc4f06
 
 
8b14f83
5cc4f06
 
2906c35
 
 
 
8b14f83
2906c35
8b14f83
2906c35
 
 
 
 
5cc4f06
 
43c8625
5cc4f06

import streamlit as st
from st_audiorec import st_audiorec

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
#from datasets import load_dataset
import torch
from gliner import GLiNER

from resources import Lead_Labels, entity_labels, set_start, audit_elapsedtime

# Module-level state read by main():
#   rec       - speech-recognition pipeline, created on first use
#   ner       - entity-recognition model, created on first use
#   iteration - counter of how many times main() has been entered
rec = ner = None
iteration = 0

def main ():
    """Run one pass of the app: load models, extract entities, handle audio.

    Steps, in order:
      1. Bump the module-level ``iteration`` counter.
      2. Create the transcription pipeline (``rec``) and the NER model
         (``ner``) if they have not been created yet.
      3. Run entity extraction on a hard-coded sample sentence.
      4. Render the audio recorder widget; when it yields audio data, play
         it back, transcribe it and run entity extraction on the transcript.

    Returns:
        None. Results are shown through Streamlit and printed to stdout.
    """
    # These names are assigned below. Without the ``global`` declaration
    # Python treats them as locals of main(), and the very first read
    # (the print just below) raises UnboundLocalError.
    global rec, ner, iteration

    print(f"Main iteration {iteration}")
    iteration += 1

    # NOTE(review): Streamlit is understood to re-execute the script on each
    # interaction, so these module globals may not survive between reruns;
    # consider st.cache_resource for the model loaders - confirm.
    if rec is None:
        print("rec is None")
        rec = init_model_trans()
    if ner is None:
        print("ner is None")
        ner = init_model_ner() #async

    labels = entity_labels

    # Smoke-test the NER model on a fixed sentence before any audio exists.
    text = "I have a proposal from cgd where they want one outsystems junior developers and one senior for an estimate of three hundred euros a day, for six months."
    print(f"get entities from sample text: {text}")
    get_entity_labels(model=ner, text=text, labels=labels)

    print("Render UI")
    # Presumably returns the recorded WAV bytes, or None while nothing has
    # been recorded - verify against st_audiorec.
    wav_audio_data = st_audiorec()

    if wav_audio_data is not None and rec is not None:
        print("Loading data...")
        st.audio(wav_audio_data, format='audio/wav')
        text = transcribe(wav_audio_data, rec)
        if text is not None:
            get_entity_labels(labels=labels, model=ner, text=text)


def init_model_trans():
    """Build and return the automatic-speech-recognition pipeline.

    Loads the ``openai/whisper-large-v3`` model and its processor, moves the
    model to the first CUDA device (float16) when CUDA is available or keeps
    it on the CPU (float32) otherwise, and wraps both in a ``transformers``
    pipeline configured for 30-second chunks, batches of 16 and timestamps.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    print("Initiating transcription model...")
    # set_start / audit_elapsedtime come from the project's ``resources``
    # module; presumably they capture a start time and report the elapsed
    # duration - confirm there.
    started_at = set_start()

    if torch.cuda.is_available():
        device, torch_dtype = "cuda:0", torch.float16
    else:
        device, torch_dtype = "cpu", torch.float32

    checkpoint = "openai/whisper-large-v3"

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        checkpoint,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(checkpoint)

    pipeline_options = {
        "model": model,
        "tokenizer": processor.tokenizer,
        "feature_extractor": processor.feature_extractor,
        "max_new_tokens": 128,
        "chunk_length_s": 30,
        "batch_size": 16,
        "return_timestamps": True,
        "torch_dtype": torch_dtype,
        "device": device,
    }
    asr_pipeline = pipeline("automatic-speech-recognition", **pipeline_options)

    print(f'Init model successful: {model}' )
    audit_elapsedtime(function="Initiating transcription model", start=started_at)
    return asr_pipeline

def init_model_ner():
    """Load and return the ``urchade/gliner_multi`` GLiNER model.

    The load is bracketed by ``set_start`` / ``audit_elapsedtime`` from the
    project's ``resources`` module (presumably to report how long loading
    took - confirm there).

    Returns:
        The object returned by ``GLiNER.from_pretrained``.
    """
    print("Initiating NER model...")
    started_at = set_start()
    ner_model = GLiNER.from_pretrained("urchade/gliner_multi")
    audit_elapsedtime(function="Initiating NER model", start=started_at)
    return ner_model

def transcribe(audio_sample: bytes, pipe) -> str:
    """Transcribe recorded audio and show the transcript in the UI.

    Args:
        audio_sample: Audio data passed straight to ``pipe``.
        pipe: Callable speech-recognition pipeline; its result must support
            ``result["text"]``.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    started_at = set_start()
    output = pipe(audio_sample)
    audit_elapsedtime(function="Transcription", start=started_at)
    # Dump the raw pipeline result to stdout for debugging.
    print(output)

    transcript = output["text"]
    st.write('trancription: ', transcript)
    return transcript

def get_entity_labels(model: GLiNER, text: str, labels: list): #-> Lead_labels:
    """Extract entities from ``text`` and display them.

    Calls ``model.predict_entities(text, labels)``, prints each entity as
    ``<text> => <label>`` to stdout and writes the full entity list to the
    Streamlit page.

    Args:
        model: Model exposing ``predict_entities(text, labels)``.
        text: Sentence(s) to analyse.
        labels: Entity labels to look for.

    Returns:
        None. (Returning a ``Lead_Labels`` object appears to be planned but
        is not implemented.)
    """
    started_at = set_start()
    found_entities = model.predict_entities(text, labels)
    audit_elapsedtime(function="Retreiving entity labels from text", start=started_at)

    for found in found_entities:
        span, tag = found["text"], found["label"]
        print(span, "=>", tag)

    st.write('Entities: ', found_entities)

# Script entry point: log that the guard was hit, then run the app once.
if __name__ == "__main__":
    print("IN __name__")
    main()