File size: 2,837 Bytes
1c66a80
 
 
 
 
ed0abb5
f67682f
 
678aa20
 
1c66a80
5a85225
93c16a2
f888789
97f56b5
 
ae21500
 
97f56b5
 
12d3854
 
246aa34
5a85225
12d3854
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08f3107
12d3854
 
 
 
c1347d6
12d3854
 
 
d66b86d
12d3854
 
1c66a80
08f3107
1c66a80
08f3107
1c66a80
cefb089
e103d16
cefb089
1c66a80
cefb089
1c66a80
 
 
678aa20
1c66a80
 
 
 
 
89f0384
e687694
1c66a80
 
 
 
 
 
246aa34
 
1c66a80
b91edc3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import speech_recognition as sr
from pydub import AudioSegment
import gradio as gr
from os import path
import requests
import openai
from openai import OpenAI

prompt = "Type and press Enter"


def record_text(audio_file,api_key):
    client = OpenAI(api_key = api_key)
    audio_file = open(audio_file, "rb") 
    transcript = client.audio.transcriptions.create(
        model="whisper-1", 
        file=audio_file,
        response_format="text"
        ) 
    return transcript


def api_calling(audio_file, prompt, api_key):
    audio_text = record_text(audio_file,api_key)
    if len(prompt) == 0:
        prompt = "Apply proper punctuations, upper case and lower case to the provided text."
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "text",
                        "text": audio_text
                    }
                ]
            }
        ],
        "max_tokens": 1500
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    audio_text_res = response.json()
    return audio_text_res["choices"][0]["message"]["content"]



def message_and_history(audio_text,input, history, api_key):
    history = history or []
    output_text = api_calling(audio_text,input,api_key)
    
    if len(input) == 0:
        input = "Speech from the video."
        history.append((input, output_text))
    else:
        history.append((input, output_text))
    
    return history, history


block = gr.Blocks(theme=gr.themes.Glass(primary_hue="slate"))
with block:
    gr.Markdown("""<h1><center>Audio Recognition - Ask and Learn about an Audio</center></h1> """)
    with gr.Row():
        with gr.Column(scale=0.5):
            aud_input = gr.Audio(type="filepath", label="Upload Audio", sources="upload")
            api_input = gr.Textbox(label="Enter Api-key")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
        with gr.Column():
            chatbot = gr.Chatbot(label="Ask questions about the audio")
            message = gr.Textbox(label="User", placeholder=prompt)
            state = gr.State()
            
    upload_button.click(message_and_history, inputs=[aud_input,message, state, api_input], outputs=[chatbot, state])
    message.submit(message_and_history, inputs=[aud_input,message, state, api_input], outputs=[chatbot, state])
    message.submit(lambda: None, None, message, queue=False)
block.launch()