# Spaces: shukdevdatta123/GPT-4.5-Multimodal-Chatbot (Running)
# File size: 10,942 Bytes

import gradio as gr
import openai
import fitz  # PyMuPDF for PDF processing
import base64
import io

# Module-level OpenAI API key: written by set_api_key() and read by the
# request handlers below. It stays empty until the user submits a key.
# NOTE(review): as a module global this value is presumably shared by every
# session served by this process - confirm that is acceptable for deployment.
api_key = ""

# Function to update API key
def set_api_key(key):
    """Store the OpenAI API key used by subsequent requests.

    Surrounding whitespace (common when a key is pasted) is stripped before
    the key is saved in the module-level ``api_key``.

    Args:
        key: The key text entered by the user. ``None`` is treated as empty.

    Returns:
        A status message: the success message when a non-empty key was
        stored, otherwise a prompt asking for a key (the stored key is
        cleared in that case).
    """
    global api_key
    api_key = (key or "").strip()
    if not api_key:
        # Do not claim success for a blank submission.
        return "Please enter a valid OpenAI API key."
    return "API Key Set Successfully!"

# Function to interact with OpenAI API
def query_openai(messages, temperature, top_p, max_output_tokens):
    """Send a chat request to the OpenAI API and return the reply text.

    Args:
        messages: Chat messages passed through unchanged as ``messages``.
        temperature: Sampling temperature. Only a missing value (``None`` or
            ``""``) falls back to 1.0; an explicit 0 is honoured.
        top_p: Nucleus-sampling value. Only a missing value (``None`` or
            ``""``) falls back to 1.0; an explicit 0 is honoured.
        max_output_tokens: Upper bound on generated tokens. Any falsy value
            (``None``, ``""``, 0) falls back to 2048.

    Returns:
        The message content of the first returned choice, a prompt asking for
        the API key when none is set, or ``"Error: ..."`` describing any
        exception raised while preparing or sending the request.
    """
    if not api_key:
        return "Please enter your OpenAI API key first."

    try:
        openai.api_key = api_key  # Set API key dynamically

        # Use the defaults only when a value is actually missing. A plain
        # truthiness test would silently replace a deliberate 0 (both sliders
        # start at 0) with 1.0.
        temperature = 1.0 if temperature in (None, "") else float(temperature)
        top_p = 1.0 if top_p in (None, "") else float(top_p)
        # A budget of 0 tokens is not a useful request, so every falsy value
        # keeps falling back to the default here.
        max_output_tokens = int(max_output_tokens) if max_output_tokens else 2048

        response = openai.ChatCompletion.create(
            model="gpt-4.5-preview",
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_output_tokens,
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        return f"Error: {str(e)}"

# Function to process image URL input
def image_url_chat(image_url, text_query, temperature, top_p, max_output_tokens):
    """Ask the model a question about an image referenced by URL.

    Args:
        image_url: URL of the image to send.
        text_query: The user's question about the image.
        temperature, top_p, max_output_tokens: Forwarded to ``query_openai``.

    Returns:
        The string returned by ``query_openai``, or a prompt when either the
        URL or the question is missing.
    """
    if not (image_url and text_query):
        return "Please provide an image URL and a query."

    # One user turn carrying the image first, then the question.
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    question_part = {"type": "text", "text": text_query}
    conversation = [{"role": "user", "content": [image_part, question_part]}]
    return query_openai(conversation, temperature, top_p, max_output_tokens)

# Function to process text input
def text_chat(text_query, temperature, top_p, max_output_tokens):
    """Send a plain-text question to the model.

    Args:
        text_query: The user's question.
        temperature, top_p, max_output_tokens: Forwarded to ``query_openai``.

    Returns:
        The string returned by ``query_openai``, or a prompt when the
        question is empty.
    """
    if text_query:
        user_turn = {
            "role": "user",
            "content": [{"type": "text", "text": text_query}],
        }
        return query_openai([user_turn], temperature, top_p, max_output_tokens)

    return "Please enter a query."

# Function to process uploaded image input
def image_chat(image_file, text_query, temperature, top_p, max_output_tokens):
    """Ask the model a question about an uploaded image.

    The image is read from disk and embedded in the request as a base64
    data URL.

    Args:
        image_file: Path of the uploaded image, or ``None`` when nothing was
            uploaded.
        text_query: The user's question about the image.
        temperature, top_p, max_output_tokens: Forwarded to ``query_openai``.

    Returns:
        The string returned by ``query_openai``; a prompt when the image or
        the question is missing; or ``"Error processing the image: ..."``
        when the file cannot be read.
    """
    import mimetypes  # local import: only this handler needs it

    if image_file is None or not text_query:
        return "Please upload an image and provide a query."

    # Label the data URL with the file's actual type instead of always
    # claiming JPEG. Fall back to JPEG when the extension is unknown or does
    # not map to an image type.
    mime_type, _ = mimetypes.guess_type(str(image_file))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Encode image as base64; report read failures as a message, matching
    # how the other handlers surface errors.
    try:
        with open(image_file, "rb") as img:
            base64_image = base64.b64encode(img.read()).decode("utf-8")
    except OSError as e:
        return f"Error processing the image: {str(e)}"

    image_data = f"data:{mime_type};base64,{base64_image}"

    messages = [
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": image_data}},
            {"type": "text", "text": text_query}
        ]},
    ]
    return query_openai(messages, temperature, top_p, max_output_tokens)

# Function to process uploaded PDF input
def pdf_chat(pdf_file, text_query, temperature, top_p, max_output_tokens):
    """Ask the model a question about the text of an uploaded PDF.

    Text is extracted from every page and sent to the model followed by the
    user's question.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path string or as
            an object exposing the path via ``.name``; ``None`` when nothing
            was uploaded.
        text_query: The user's question about the document.
        temperature, top_p, max_output_tokens: Forwarded to ``query_openai``.

    Returns:
        The string returned by ``query_openai``; a prompt when the PDF or the
        question is missing; a notice when no text could be extracted; or
        ``"Error processing the PDF: ..."`` on any failure.
    """
    if pdf_file is None or not text_query:
        return "Please upload a PDF and provide a query."

    try:
        # The upload component is configured with type="filepath", so a plain
        # string arrives here and has no `.name`. Accept both a path and a
        # file-like wrapper carrying the path.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        # The context manager closes the document even if extraction fails.
        with fitz.open(pdf_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)

        # If no text found, return an error
        if not text.strip():
            return "No text found in the PDF."

        # Extracted document text first, then the user's question.
        messages = [
            {"role": "user", "content": [
                {"type": "text", "text": text},
                {"type": "text", "text": text_query}
            ]},
        ]
        return query_openai(messages, temperature, top_p, max_output_tokens)

    except Exception as e:
        return f"Error processing the PDF: {str(e)}"

# Function to process audio file and convert to text
def process_audio(audio_file, query, temperature, top_p, max_output_tokens):
    """Transcribe an audio input and ask the model a question about it.

    Args:
        audio_file: Either a tuple from a numpy-typed audio component
            (``(sample_rate, data)``), a filesystem path, an object exposing
            its path via ``.name``, or ``None`` when nothing was provided.
        query: The user's question about the transcription.
        temperature, top_p, max_output_tokens: Forwarded to ``query_openai``.

    Returns:
        The string returned by ``query_openai``; a prompt when no audio was
        provided; the transcription error message when transcription failed;
        or ``"Error processing audio: ..."`` on any other failure.
    """
    if audio_file is None:
        return "Please upload or record audio first."

    try:
        if isinstance(audio_file, tuple):
            # Convert the recorded samples to WAV bytes in memory.
            import scipy.io.wavfile as wav

            # Gradio's numpy audio format is (sample_rate, data), in that
            # order; unpacking it the other way round hands the array to
            # wav.write as the rate.
            sample_rate, audio_data = audio_file
            buffer = io.BytesIO()
            wav.write(buffer, sample_rate, audio_data)
            audio_binary = buffer.getvalue()
        else:
            # Accept a plain path as well as a file wrapper carrying `.name`.
            audio_path = getattr(audio_file, "name", audio_file)
            with open(audio_path, "rb") as f:
                audio_binary = f.read()

        # Transcribe the audio
        transcription = transcribe_audio(audio_binary, api_key)

        # transcribe_audio reports failures as strings. Surface them to the
        # user instead of sending an error message to the model as if it were
        # the transcript.
        if isinstance(transcription, str) and transcription.startswith(
            ("Error transcribing audio:", "Error: No API key provided.")
        ):
            return transcription

        # Use the transcription and query to get a response
        messages = [
            {"role": "user", "content": [
                {"type": "text", "text": f"Transcription: {transcription}"},
                {"type": "text", "text": f"Query: {query}"}
            ]},
        ]
        return query_openai(messages, temperature, top_p, max_output_tokens)
    except Exception as e:
        return f"Error processing audio: {str(e)}"

# Function to transcribe audio to text using OpenAI Whisper API
def transcribe_audio(audio_binary, openai_api_key, filename="audio.wav"):
    """Transcribe raw audio bytes to text with the ``whisper-1`` model.

    Args:
        audio_binary: The encoded audio file contents as bytes.
        openai_api_key: API key to use for the request.
        filename: Name assigned to the in-memory file object handed to the
            OpenAI client. Defaults to ``"audio.wav"``; callers holding
            non-WAV audio can pass a name with the real extension
            (presumably used by the API as a format hint - confirm).

    Returns:
        The transcribed text, or an error string: ``"Error: No API key
        provided."`` when the key is empty, otherwise
        ``"Error transcribing audio: ..."``.
    """
    if not openai_api_key:
        return "Error: No API key provided."

    # Reject empty input up front instead of spending an API call on it.
    if not audio_binary:
        return "Error transcribing audio: no audio data provided."

    openai.api_key = openai_api_key

    try:
        # Wrap the bytes in a named file-like object for the client.
        audio_file_obj = io.BytesIO(audio_binary)
        audio_file_obj.name = filename

        # Transcribe the audio to text using OpenAI's whisper model
        audio_file_transcription = openai.Audio.transcribe(file=audio_file_obj, model="whisper-1")
        return audio_file_transcription.text
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

# Function to clear the chat
def clear_chat():
    """Return reset values for every component wired to the Clear Chat button.

    The values are positional: one per component in the ``outputs`` list of
    ``clear_button.click``, in the same order (19 in total). Text fields are
    reset to ``""``, file/audio inputs to ``None``, and the sampling sliders
    to their initial values.

    Returns:
        A 19-tuple of reset values.
    """
    return (
        "", "", "",        # image_url, image_query, image_url_output
        "", "",            # text_query, text_output
        "", "",            # image_text_query, image_output
        None, "", "",      # pdf_upload, pdf_text_query, pdf_output
        None, "", "",      # audio_upload, audio_query, audio_output
        None, "", "",      # audio_record, audio_record_query, audio_record_output
        1.0, 1.0, 2048,    # temperature, top_p, max_output_tokens
    )

# Gradio UI Layout
# Build the whole interface inside a single Blocks context. Components are
# created in the order they should appear, so statement order here is part of
# the layout.
with gr.Blocks() as demo:
    gr.Markdown("## GPT-4.5 Preview Chatbot")

    # Accordion for explaining hyperparameters
    with gr.Accordion("Hyperparameters", open=False):
        gr.Markdown(""" 
        ### Temperature: 
        Controls the randomness of the model's output. A lower temperature makes the model more deterministic, while a higher temperature makes it more creative and varied.
        ### Top-P (Nucleus Sampling): 
        Controls the cumulative probability distribution from which the model picks the next word. A lower value makes the model more focused and deterministic, while a higher value increases randomness.
        ### Max Output Tokens: 
        Limits the number of tokens (words or subwords) the model can generate in its response. You can use this to control the length of the response.
        """)

    # API Key Input
    with gr.Row():
        api_key_input = gr.Textbox(label="Enter OpenAI API Key", type="password")
        api_key_button = gr.Button("Set API Key", elem_id="api_key_button")
        api_key_output = gr.Textbox(label="API Key Status", interactive=False)

    # Sampling controls shared by every tab; their values are passed to each
    # handler as the last three inputs.
    with gr.Row():
        temperature = gr.Slider(0, 2, value=1.0, step=0.1, label="Temperature")
        top_p = gr.Slider(0, 1, value=1.0, step=0.1, label="Top-P")
        max_output_tokens = gr.Slider(0, 16384, value=2048, step=512, label="Max Output Tokens")
    
    # One tab per input modality, each with its own query box, response box
    # and "Ask" button.
    # NOTE(review): every "Ask" button below uses elem_id="ask_button"; DOM ids
    # are normally expected to be unique - confirm whether a shared class was
    # intended instead.
    with gr.Tabs():
        with gr.Tab("Image URL Chat"):
            image_url = gr.Textbox(label="Enter Image URL")
            image_query = gr.Textbox(label="Ask about the Image")
            image_url_output = gr.Textbox(label="Response", interactive=False)
            image_url_button = gr.Button("Ask", elem_id="ask_button")
        
        with gr.Tab("Text Chat"):
            text_query = gr.Textbox(label="Enter your query")
            text_output = gr.Textbox(label="Response", interactive=False)
            text_button = gr.Button("Ask", elem_id="ask_button")
        
        with gr.Tab("Image Chat"):
            image_upload = gr.File(label="Upload an Image", type="filepath")
            image_text_query = gr.Textbox(label="Ask about the uploaded image")
            image_output = gr.Textbox(label="Response", interactive=False)
            image_button = gr.Button("Ask", elem_id="ask_button")
        
        with gr.Tab("PDF Chat"):
            pdf_upload = gr.File(label="Upload a PDF", type="filepath")
            pdf_text_query = gr.Textbox(label="Ask about the uploaded PDF")
            pdf_output = gr.Textbox(label="Response", interactive=False)
            pdf_button = gr.Button("Ask", elem_id="ask_button")
        
        with gr.Tab("Voice Chat (Upload)"):
            audio_upload = gr.File(label="Upload an Audio File")
            audio_query = gr.Textbox(label="Ask about the transcription")
            audio_output = gr.Textbox(label="Response", interactive=False)
            audio_button = gr.Button("Ask", elem_id="ask_button")

        with gr.Tab("Voice(Record) Chat"):
            # The recording is requested with type="numpy"; process_audio has
            # a dedicated tuple branch for this kind of input.
            audio_record = gr.Audio(label="Record your voice", type="numpy")
            audio_record_query = gr.Textbox(label="Ask about the transcription")
            audio_record_output = gr.Textbox(label="Response", interactive=False)
            audio_record_button = gr.Button("Ask", elem_id="ask_button")

    # Clear chat button
    clear_button = gr.Button("Clear Chat", elem_id="clear_chat_button")

    # Button Click Actions: each call is (handler, inputs, output component).
    api_key_button.click(set_api_key, inputs=[api_key_input], outputs=[api_key_output])
    image_url_button.click(image_url_chat, [image_url, image_query, temperature, top_p, max_output_tokens], image_url_output)
    text_button.click(text_chat, [text_query, temperature, top_p, max_output_tokens], text_output)
    image_button.click(image_chat, [image_upload, image_text_query, temperature, top_p, max_output_tokens], image_output)
    pdf_button.click(pdf_chat, [pdf_upload, pdf_text_query, temperature, top_p, max_output_tokens], pdf_output)
    
    # For Voice Chat (Upload)
    audio_button.click(process_audio, 
                      [audio_upload, audio_query, temperature, top_p, max_output_tokens], 
                      audio_output)
    
    # For Voice Chat (Record): same handler as the upload tab.
    audio_record_button.click(process_audio, 
                             [audio_record, audio_record_query, temperature, top_p, max_output_tokens], 
                             audio_record_output)

    # Clear button resets all necessary fields
    # The outputs list below has 19 entries; clear_chat is expected to return
    # one value per entry, in this order.
    # NOTE(review): image_upload is not in this list, so the uploaded image is
    # not reset by the button - confirm whether that is intended.
    clear_button.click(
        clear_chat,
        outputs=[
            image_url, image_query, image_url_output, 
            text_query, text_output, 
            image_text_query, image_output, 
            pdf_upload, pdf_text_query, pdf_output,
            audio_upload, audio_query, audio_output,
            audio_record, audio_record_query, audio_record_output,
            temperature, top_p, max_output_tokens
        ]
    )

# Launch Gradio App (only when executed as a script, not when imported)
if __name__ == "__main__":
    demo.launch()