import os
import spaces
import torch
import gradio as gr
from openai import OpenAI
from transformers import pipeline

MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
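# BATCH_SIZE controls how many 30 s audio chunks Whisper decodes in parallel;
# FILE_LIMIT_MB caps the accepted upload size (enforced in audio_transcribe).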

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# transformers pipelines take a CUDA device index (0 = first GPU) or "cpu".
device = 0 if torch.cuda.is_available() else "cpu"

# Whisper natively handles at most 30 s of audio; chunk_length_s=30 makes the
# pipeline split longer files into 30 s windows and stitch the transcriptions.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

@spaces.GPU
def respond_to_question(transcript, question):
    # Optionally answer the user's question about the transcript via the
    # OpenAI API. The UI below wires up respond_to_question_llama instead;
    # this helper is kept as an alternative backend.
    # gpt-4o-mini is a chat model, so it must be called through the Chat
    # Completions endpoint with model=/messages= (the legacy Completions
    # endpoint's engine= parameter does not support it).
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": f"Transcript: {transcript}\n\nUser: {question}\n\nAI:",
            }
        ],
        temperature=0.3,
        max_tokens=60,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content

    return response

@spaces.GPU
def respond_to_question_llama(transcript, question):
    from huggingface_hub import InferenceClient

    client = InferenceClient(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    )

    # chat_completion returns OpenAI-style choices; the generated text lives
    # at .message.content, not .content.
    response = client.chat_completion(
        messages=[{"role": "user", "content": f"Transcript: {transcript}\n\nUser: {question}"}],
        max_tokens=500,
    ).choices[0].message.content

    return response

@spaces.GPU
def audio_transcribe(inputs):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
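
    # Added sketch: enforce the FILE_LIMIT_MB cap declared above, assuming
    # `inputs` is a local file path (which gr.File(type="filepath") provides).
    if os.path.getsize(inputs) > FILE_LIMIT_MB * 1024 * 1024:
        raise gr.Error(f"Audio file exceeds the {FILE_LIMIT_MB} MB limit.")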

    # return_timestamps=True is required by the Whisper pipeline for audio
    # longer than 30 s; only the plain text is returned here.
    text = pipe(inputs, batch_size=BATCH_SIZE, return_timestamps=True)["text"]

    return text

with gr.Blocks() as transcriberUI:
    gr.Markdown(
        """
        # Hello!
        Click the button below to select the audio file you want to chat about!
        Available 24x7. Running on ZeroGPU with openai/whisper-large-v3
        """
    )
    # gradio file_types expects extensions with a leading dot (".mp3").
    inp = gr.File(label="Audio File", show_label=True, type="filepath", file_count="single", file_types=[".mp3"])
    transcribe = gr.Textbox(label="Transcription", show_label=True, show_copy_button=True)
    ask_question = gr.Textbox(label="Ask a question", visible=True)
    response_output = gr.Textbox(label="Response", visible=True)
    submit_question = gr.Button("Submit question", visible=True)

    def ask_question_callback(transcription, question):
        # Test the submitted question string, not the `ask_question` Textbox
        # component (a component object is always truthy), and return the
        # answer so Gradio updates the output (assigning to .value does not
        # update a running app).
        if question:
            return respond_to_question_llama(transcription, question)
        return "No question asked"

    # Wiring: uploading a file transcribes it; the button answers the question
    # against the current transcript.
    inp.upload(audio_transcribe, inputs=inp, outputs=transcribe)
    submit_question.click(ask_question_callback, inputs=[transcribe, ask_question], outputs=[response_output])


transcriberUI.queue().launch(debug=True)
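
# Deployment assumptions: OPENAI_API_KEY and HUGGINGFACEHUB_API_TOKEN are set
# as Space secrets, and the Space runs on ZeroGPU hardware so the @spaces.GPU
# decorators can request a GPU for each call.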