File size: 3,271 Bytes
5ad5566
03ddc3f
 
3597c88
03ddc3f
3597c88
03ddc3f
 
 
3597c88
03ddc3f
3597c88
03ddc3f
 
 
 
 
 
87f602f
3e9dc66
 
 
 
a84f44c
5f49ba8
3e9dc66
 
 
10fea9b
5aa403b
787c0bf
3e9dc66
 
 
5195d28
 
 
 
 
b040059
5195d28
0c9b4d5
 
be72dc1
 
5195d28
3597c88
 
0d16ed8
 
5195d28
 
0d16ed8
 
7894a90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3597c88
0c9b4d5
 
 
763daee
0d16ed8
5f013ee
1a99cb6
3e9dc66
0d16ed8
ac8ebf8
0d16ed8
ac8ebf8
29000fa
7894a90
5ad5566
0c9b4d5
5ad5566
3597c88
763daee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import spaces
import torch
import gradio as gr
from transformers import pipeline

# Checkpoint handed to the speech-recognition pipeline below.
MODEL_NAME = "openai/whisper-large-v3"
# Batch size passed to the pipeline on each transcription call.
BATCH_SIZE = 8
# NOTE(review): not referenced anywhere in the visible part of this file.
FILE_LIMIT_MB = 1000

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Module-level ASR pipeline shared by every request; audio is processed in
# 30-second chunks.
pipe = pipeline(
    model=MODEL_NAME,
    task="automatic-speech-recognition",
    device=device,
    chunk_length_s=30,
)

def respond_to_question_llama(transcript, question):
    """Ask the hosted Llama 3.1 70B Instruct model a question about a transcript.

    The transcript and the question are combined into a single user message
    and sent through the Hugging Face ``InferenceClient`` chat-completion API.

    Args:
        transcript: Text of the transcription the question refers to.
        question: The user's question.

    Returns:
        The content of the first choice in the chat-completion response.

    Raises:
        KeyError: If ``HUGGINGFACEHUB_API_TOKEN`` is not set in the environment.
    """
    # Imported lazily so the dependency is only loaded when a question is asked.
    from huggingface_hub import InferenceClient

    api_token = os.environ["HUGGINGFACEHUB_API_TOKEN"]
    llm = InferenceClient(
        "meta-llama/Meta-Llama-3.1-70B-Instruct",
        token=api_token,
    )

    prompt = f"Transcript: {transcript}\n\nUser: {question}"
    completion = llm.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=4096,
    )

    return completion.choices[0].message.content

@spaces.GPU
def audio_transcribe(inputs):
    """Transcribe the submitted audio and reveal the question widgets.

    Args:
        inputs: The audio input, forwarded unchanged to the module-level
            ``pipe``; ``None`` when nothing was submitted.

    Returns:
        A four-item list: the transcript text followed by three visibility
        updates, one for each widget listed after the transcript in the
        event's outputs.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )

    # gr.update is component-agnostic: the targets are a mix of Textbox and
    # Button widgets, so returning Textbox instances for all of them was
    # misleading. Only the `visible` property is changed.
    return [
        result["text"],
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    ]

def hidden_ask_question():
    """Return three updates that hide the question-related widgets.

    Wired to the clear button, whose outputs are the question box, the
    response box and the submit button. ``gr.update`` is used instead of
    ``gr.Textbox`` instances because one of those targets is a Button.
    """
    return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)]

# UI definition: upload an audio file, show its transcript, then let the user
# ask questions about the transcript.
with gr.Blocks() as transcriberUI:
    gr.Markdown(
        """
        # Ola!
        Clique no botao abaixo para selecionar o Audio que deseja conversar!
        Ambiente disponivel 24x7. Running on ZeroGPU with openai/whisper-large-v3
        """
    )
    # Title and description that were previously passed to a gr.Interface
    # wrapper. That wrapper referenced names before they were defined
    # (`fn=transcribe`, and outputs listing a function and later-defined
    # widgets), so the explicit Blocks wiring below replaces it.
    gr.Markdown(
        "## Chat with your Audio\n\n"
        "Transcribe and Chat with your audio inputs with the click of a button! This prototype uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    )

    # `format` is documented as a single string ("wav" or "mp3"); the original
    # list value is not a documented input, so the component default is used.
    inp = gr.Audio(sources=["upload"], type="filepath", label="Audio file")
    transcribe = gr.Textbox(label="Transcricao", show_label=True, show_copy_button=True)
    # The question widgets start hidden and are revealed by audio_transcribe.
    ask_question = gr.Textbox(label="Ask a question", visible=False)
    response_output = gr.Textbox(label="Response", visible=False)
    submit_question = gr.Button("Submit question", visible=False)
    clear_button = gr.ClearButton([transcribe, response_output, inp, ask_question])

    def ask_question_callback(transcription, question):
        """Answer `question` about `transcription`, or report that none was asked.

        Args:
            transcription: Current contents of the transcript textbox.
            question: Current contents of the question textbox.

        Returns:
            The model's answer, or "No question asked" when the question is
            empty or whitespace-only.
        """
        # Check the submitted text, not the `ask_question` component (which is
        # always truthy and made the fallback branch unreachable).
        if question and question.strip():
            return respond_to_question_llama(transcription, question)
        return "No question asked"

    # Transcribe as soon as a file is uploaded and reveal the question widgets.
    inp.upload(
        audio_transcribe,
        inputs=inp,
        outputs=[transcribe, ask_question, submit_question, response_output],
    )
    submit_question.click(
        ask_question_callback,
        inputs=[transcribe, ask_question],
        outputs=[response_output],
    )
    # Besides clearing values, hide the question widgets again.
    clear_button.click(
        hidden_ask_question,
        outputs=[ask_question, response_output, submit_question],
    )


transcriberUI.queue().launch()