File size: 6,565 Bytes
bd07176
 
 
 
d2e6583
bd07176
d2e6583
 
 
4880a2a
 
 
bd07176
 
 
 
 
 
d2e6583
 
 
 
 
 
 
 
 
 
 
bd07176
 
d2e6583
bd07176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2e6583
 
bd07176
 
d2e6583
 
 
bd07176
d2e6583
 
 
bd07176
d2e6583
 
 
 
 
bd07176
 
d2e6583
 
 
 
 
 
 
 
 
 
 
bd07176
 
 
d2e6583
 
 
 
 
 
 
 
 
 
 
 
 
e6bd973
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2e6583
 
 
bd07176
d2e6583
 
 
 
bd07176
d2e6583
 
 
bd07176
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import os
import base64
import requests
import streamlit as st
import json

# Seed the per-session streaming preference the first time the script runs
# in a session; later reruns keep whatever value is already stored.
if "stream" not in st.session_state:
    st.session_state["stream"] = True

# Identifier of the vision model, and the chat-completions endpoint that
# requests for it are posted to.
MODEL_ID = "meta/llama-3.2-90b-vision-instruct"
invoke_url = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct/chat/completions"

# Bearer token for the endpoint, read from the environment (None when unset).
api_key = os.environ.get("NVIDIA_VISION_API_KEY")

def encode_image(image_path):
    """Read the file at ``image_path`` and return its contents as base64 text.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to a ``str``.
    """
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def extract_content(chunk):
    """Extract the generated text carried by a server-sent-event chunk.

    Each ``data:`` line in the chunk is expected to hold a JSON object shaped
    like ``{"choices": [{"delta": {"content": "..."}}]}``. Lines that do not
    match that shape contribute nothing instead of raising: this covers the
    ``[DONE]`` sentinel, lines without a ``data:`` prefix, deltas without a
    ``content`` key, a null ``content``, and malformed JSON.

    Args:
        chunk: Raw event data as ``bytes`` (UTF-8) or ``str``. It may contain
            one event line or several separated by newlines.

    Returns:
        The concatenated ``content`` strings of every parsable event in the
        chunk, or ``""`` when there is none.
    """
    if not chunk:
        return ""

    if isinstance(chunk, (bytes, bytearray)):
        try:
            chunk = chunk.decode('utf-8')
        except UnicodeDecodeError:
            return ""

    pieces = []
    # Split on "\n" only: str.splitlines() would also break on characters such
    # as U+2028, which may legally appear unescaped inside a JSON string.
    for line in chunk.split("\n"):
        line = line.strip()
        if not line.startswith("data:"):
            continue
        # Strip only the leading prefix, so a "data: " occurring inside the
        # generated text does not truncate the JSON payload.
        payload = line[len("data:"):].strip()
        if not payload or payload == "[DONE]":
            continue
        try:
            content = json.loads(payload)['choices'][0]['delta']['content']
        except (json.JSONDecodeError, KeyError, IndexError, TypeError):
            continue
        if isinstance(content, str):
            pieces.append(content)
    return "".join(pieces)
    
  
# (connect, read) timeouts in seconds for the inference request, so that a
# stalled connection cannot block the script indefinitely.
_REQUEST_TIMEOUT = (10, 120)


def _image_data_uri(uploaded_image):
    """Return a base64 ``data:`` URI for the uploaded image."""
    # Use the MIME type reported for the upload so JPEG files are not labelled
    # as PNG; fall back to PNG when the attribute is missing or empty.
    mime_type = getattr(uploaded_image, "type", None) or "image/png"
    encoded = base64.b64encode(uploaded_image.getvalue()).decode('utf-8')
    return f"data:{mime_type};base64,{encoded}"


def _render_streamed_response(response):
    """Render a server-sent-event response incrementally in a placeholder."""
    response_container = st.empty()
    content = ""
    for chunk in response.iter_lines():
        if not chunk:
            continue
        chunk_str = chunk.decode('utf-8', errors='replace').strip()
        # Remove the "data:" prefix when present.
        if chunk_str.startswith("data:"):
            chunk_str = chunk_str[len("data:"):].strip()
        if chunk_str == "[DONE]":
            break
        if not chunk_str:
            continue
        try:
            chunk_dict = json.loads(chunk_str)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            print(f"Invalid JSON string: {chunk_str}")
            continue
        try:
            piece = chunk_dict['choices'][0]['delta'].get('content')
        except (KeyError, IndexError, TypeError, AttributeError):
            # Events without a text delta carry nothing to display.
            continue
        if piece:
            content += piece
            response_container.markdown(content)


def _render_complete_response(response):
    """Render a non-streamed JSON chat-completion response."""
    try:
        body = response.json()
        content_string = body.get('choices', [{}])[0].get('message', {}).get('content', '')
    except (ValueError, IndexError, AttributeError, TypeError) as e:
        st.error(f"An error occurred: {e}")
        return
    st.write(f"AI Response: {content_string}")
    st.success("Response generated!")


def main():
    """Render the Streamlit page and run the selected image-analysis task.

    The page shows a title, an "About" expander, an image uploader, a task
    selector, a streaming checkbox and a "Generate Response" button. When the
    button is clicked with an image uploaded, the task text and the image (as
    a base64 data URI) are posted to ``invoke_url`` and the model's answer is
    rendered, either incrementally or in one piece depending on the checkbox.
    HTTP and network failures are reported with ``st.error`` instead of
    raising.
    """
    st.title("Multimodal Image Analysis with " + MODEL_ID)

    text = """Prof. Louie F. Cervantes, M. Eng. (Information Engineering)
    CCS 229 - Intelligent Systems
    Department of Computer Science
    College of Information and Communications Technology
    West Visayas State University
    """
    with st.expander("About"):
        st.text(text)

    st.write("Upload an image and select the image analysis task.")

    # File upload for image
    uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
    if uploaded_image is not None:
        # Display the uploaded image
        st.image(uploaded_image, caption="Uploaded Image",  use_container_width=True)

    # List of image analysis tasks
    analysis_tasks = [
        "Scene Analysis: Describe the scene depicted in the image. Identify the objects present, their spatial relationships, and any actions taking place.",
        "Object Detection and Classification: Identify and classify all objects present in the image. Provide detailed descriptions of each object, including its size, shape, color, and texture.",
        "Image Captioning: Generate a concise and accurate caption that describes the content of the image.",
        "Visual Question Answering: Answer specific questions about the image, such as 'What color is the car?' or 'How many people are in the image?'",
        "Image Similarity Search: Given a query image, find similar images from a large dataset based on visual features.",
        "Image Segmentation: Segment the image into different regions corresponding to objects or areas of interest.",
        "Optical Character Recognition (OCR): Extract text from the image, such as printed or handwritten text.",
        "Diagram Understanding: Analyze a diagram (e.g., flowchart, circuit diagram) and extract its structure and meaning.",
        "Art Analysis: Describe the artistic style, subject matter, and emotional impact of an image.",
        "Medical Image Analysis: Analyze medical images (e.g., X-rays, MRIs) to detect abnormalities or diagnose diseases.",
    ]

    # Task selection dropdown
    selected_task = st.selectbox("Select an image analysis task:", analysis_tasks)

    # The checkbox is rendered on every run, outside the button branch. A
    # widget created only inside `if st.button(...)` exists just for the click
    # run and always reports its default, so the choice could never take effect.
    stream = st.checkbox("Begin streaming the AI response as soon as it is available.", value=True)
    st.session_state.stream = stream

    if not st.button("Generate Response"):
        return

    if uploaded_image is None or not selected_task:
        st.error("Please upload an image and select a task.")
        return

    if not api_key:
        # Without this guard the request would be sent with "Bearer None".
        st.error("The NVIDIA_VISION_API_KEY environment variable is not set.")
        return

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "text/event-stream" if stream else "application/json",
    }

    # Prepare the multimodal prompt
    payload = {
        "model": MODEL_ID,
        "messages": [
            {
                "role": "user",
                "content": f'{selected_task} <img src="{_image_data_uri(uploaded_image)}" />',
            }
        ],
        "max_tokens": 512,
        "temperature": 1.00,
        "top_p": 1.00,
        "stream": stream,
    }

    with st.spinner("Processing..."):
        try:
            # The context manager releases the connection even when the
            # streaming loop stops early.
            with requests.post(
                invoke_url,
                headers=headers,
                json=payload,
                stream=stream,  # Important for streaming
                timeout=_REQUEST_TIMEOUT,
            ) as response:
                if not response.ok:
                    st.error(
                        f"Request failed with status {response.status_code}: "
                        f"{response.text[:500]}"
                    )
                    return
                if stream:
                    _render_streamed_response(response)
                else:
                    _render_complete_response(response)
        except requests.RequestException as e:
            st.error(f"An error occurred: {e}")
 
# Build the page only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()