File size: 3,942 Bytes
ebe3bf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2366517
ebe3bf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2366517
ebe3bf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2366517
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import io
import os
from openai import OpenAI
from langchain.tools import StructuredTool, Tool
from io import BytesIO
import requests
import json
from io import BytesIO
import base64
import chainlit as cl


def get_image_name():
    """
    Hand out the next sequential image name for the current user session.

    A per-session counter is kept under the "image_count" key: the first
    call stores 0, every later call stores the previous value plus one.
    The returned name has the form "image-<count>".
    """
    previous = cl.user_session.get("image_count")
    # No counter yet means this is the first image of the session.
    current = 0 if previous is None else previous + 1
    cl.user_session.set("image_count", current)
    return f"image-{current}"


def _generate_image(prompt: str):
    """
    Generate an image from a text prompt using DALL-E 3 and keep it in the
    user session.

    The OpenAI API is asked for a single 1024x1024, standard-quality image.
    The image is then downloaded from the URL in the API response and its
    raw bytes are stored in the user session under a fresh name obtained
    from `get_image_name()`. The same name is also recorded under the
    "generated_image" session key.

    Args:
        prompt: Text description of the image to generate.

    Returns:
        The session key under which the image bytes were stored.

    Raises:
        requests.HTTPError: If the image download answers with an HTTP
            error status.
        requests.Timeout: If the image download stalls past the timeout.
    """
    client = OpenAI(api_key=cl.user_session.get("api_key"))

    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size="1024x1024",
        quality="standard",
        n=1,
    )

    # Bound the download so a stalled connection cannot hang the chat, and
    # fail loudly on an HTTP error instead of storing an error body as if it
    # were image data.
    image_payload = requests.get(response.data[0].url, timeout=30)
    image_payload.raise_for_status()

    # Reserve the name only after the download succeeded, so failed attempts
    # do not consume image numbers.
    name = get_image_name()
    cl.user_session.set(name, image_payload.content)
    cl.user_session.set("generated_image", name)
    return name


def generate_image(prompt: str):
    """
    Tool entry point: generate an image and report the name it was stored
    under.

    The tool description asks the agent for a JSON string of the form
    {"prompt": "..."}. When the input parses as such an object, only the
    inner prompt text is sent for generation. Any other input (plain text,
    malformed JSON, JSON without a string "prompt" value) is used verbatim.

    Args:
        prompt: Raw tool input, either plain prompt text or the JSON wrapper
            described above.

    Returns:
        A sentence of the form "Here is <name>." where <name> is the value
        returned by `_generate_image`.
    """
    text = prompt
    try:
        parsed = json.loads(prompt)
    except (TypeError, ValueError):
        # Not JSON at all: treat the input as the prompt itself.
        parsed = None
    if isinstance(parsed, dict) and isinstance(parsed.get("prompt"), str):
        text = parsed["prompt"]

    image_name = _generate_image(text)
    return f"Here is {image_name}."


# this is our tool - which is what allows our agent to generate images in the first place!
# the `description` field is of utmost importance as it is what the LLM "brain" uses to determine
# which tool to use for a given input.
# Input shape advertised to the agent. This is a plain (non-f) string, so the
# doubled braces are kept literally - presumably so they survive a later
# str.format-style prompt template; TODO confirm.
generate_image_format = '{{"prompt": "prompt"}}'

# Description text shown to the LLM when it chooses a tool.
_generate_image_description = (
    "Useful to create an image from a text prompt. "
    "Input should be a single string strictly in the following JSON format: "
    f"{generate_image_format}"
)

# Single-string-input tool wrapping `generate_image`.
generate_image_tool = Tool.from_function(
    func=generate_image,
    name="GenerateImage",
    description=_generate_image_description,
    return_direct=True,
)


def gpt_vision_call(image_id: str):
    """
    Send the session's image conversation to the "gpt-4-vision-preview"
    model and return the completion.

    `image_id` is only printed for debugging; the request is built entirely
    from the messages stored under the "image_history" session key. The
    call is non-streaming and limited to 350 tokens, and the completion
    object from the OpenAI client is returned unchanged.
    """
    print("image_id", image_id)
    client = OpenAI(api_key=cl.user_session.get("api_key"))
    return client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=cl.user_session.get("image_history"),
        max_tokens=350,
        stream=False,
    )

def handle_image_history(msg):
    """
    Record an incoming message's image in the session's vision chat history.

    The message's image is obtained as base64 text from `process_images`.
    When that yields no image data, nothing is recorded. Otherwise a "user"
    entry holding the message text and the image (as a low-detail data URL)
    is appended to the list stored under the "image_history" session key. A
    new list is started if the session has no history yet.

    Args:
        msg: The incoming chat message; its `content` is used as the text
            part of the history entry.
    """
    image_base64 = process_images(msg)
    if not image_base64:
        return

    image_history = cl.user_session.get("image_history")
    if image_history is None:
        # The session has no history yet; start one instead of failing on
        # `.append`.
        image_history = []

    image_history.append(
        {
            "role": "user",
            "content": [
                {"type": "text", "text": msg.content},
                {
                    "type": "image_url",
                    "image_url": {
                        # NOTE: always labelled as JPEG, whatever the actual
                        # type of the attached image is.
                        "url": f"data:image/jpeg;base64,{image_base64}",
                        "detail": "low",
                    },
                },
            ],
        }
    )
    cl.user_session.set("image_history", image_history)


def process_images(msg: "cl.Message") -> "str | None":
    """
    Return the first image attached to a message as base64 text.

    Only elements whose mime type contains "image" are considered. Elements
    without a mime type are ignored. Just the first matching image is used
    (demo simplification).

    Args:
        msg: Message whose `elements` are searched for an image.

    Returns:
        The image bytes encoded as a base64 UTF-8 string, or None when the
        message carries no image element.
    """
    first_image = next(
        (
            element
            for element in (msg.elements or [])
            # A missing mime type would make the `in` check raise TypeError.
            if element.mime and "image" in element.mime
        ),
        None,
    )
    if first_image is None:
        # No image attached: signal "nothing to process" instead of raising
        # IndexError.
        return None

    return base64.b64encode(first_image.content).decode("utf-8")

# Input shape advertised to the agent. This is a plain (non-f) string, so the
# doubled braces are kept literally - presumably so they survive a later
# str.format-style prompt template; TODO confirm.
describe_image_format = '{{"image_id": "image_id"}}'

# Description text shown to the LLM when it chooses a tool.
_describe_image_description = (
    "Useful to describe an image. "
    "Input should be a single string strictly in the following JSON format: "
    f"{describe_image_format}"
)

# Single-string-input tool wrapping `gpt_vision_call`.
describe_image_tool = Tool.from_function(
    func=gpt_vision_call,
    name="DescribeImage",
    description=_describe_image_description,
    return_direct=False,
)