import gradio as gr
from openai import OpenAI
import requests
from PIL import Image
from io import BytesIO


def process_text(api_key, example, question):
    # Prefer the selected example question over free-form input.
    if example:
        question = example

    client = OpenAI(api_key=api_key)

    # Step 1: generate an image from the question with DALL-E 3.
    image_response = client.images.generate(
        model="dall-e-3",
        prompt=question,
        size="1024x1024",
        quality="standard",
        n=1,
    )
    image_url = image_response.data[0].url

    # Step 2: download the generated image so it can be shown in the UI.
    response = requests.get(image_url)
    if response.status_code != 200:
        raise gr.Error("Failed to download the generated image.")
    image = Image.open(BytesIO(response.content))

    # Step 3: ask GPT-4 Vision to answer the question, grounded in the image.
    chat_response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
        max_tokens=300,
    )
    answer = chat_response.choices[0].message.content
    return image, answer


demo = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Your API Key", type="password"),
        gr.Radio(
            [
                "A group of people are crowded around in a living room talking to one another. "
                "A man in the foreground introduces two individuals: one appears to be a regular "
                "human male, the other appears to be an animal. What is unusual about this description?",
                "A woman is waiting to get on the elevator. But the people in the elevator are on fire. "
                "Where can this event take place?",
            ],
            label="Example Question",
        ),
        gr.Textbox(label="Question"),
    ],
    outputs=[
        gr.Image(type="pil", label="Image Generated by DALL·E 3", width=768, height=768),
        gr.Textbox(label="Answer"),
    ],
    title="Chain of Images for Intuitively Reasoning",
)

if __name__ == "__main__":
    demo.launch(show_api=True)
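
# A minimal sketch of how to run this demo, assuming the script is saved as
# app.py (the filename and install commands are assumptions, not part of the
# original script):
#
#   pip install gradio openai requests pillow
#   python app.py
#
# Gradio prints a local URL; open it, paste an OpenAI API key, then pick an
# example or type a question. The app chains a DALL-E 3 image generation with
# a GPT-4 Vision answer over that image.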