Spaces:

louiecerv
/

nvidia-image-multimodal

Sleeping

File size: 6,565 Bytes

import os
import base64
import requests
import streamlit as st
import json

if "stream" not in st.session_state:
    st.session_state.stream = True

api_key = os.getenv("NVIDIA_VISION_API_KEY")
MODEL_ID = "meta/llama-3.2-90b-vision-instruct"
invoke_url = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct/chat/completions"

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_content(chunk):
    try:
        decoded_chunk = chunk.decode('utf-8')
        json_data = decoded_chunk.split('data: ')[1]
        parsed_data = json.loads(json_data)
        content = parsed_data['choices'][0]['delta']['content']
        return content
    except json.JSONDecodeError as e:
        #ignore the error
        return ""
    
  
def main():
    st.title("Multimodal Image Analysis with " + MODEL_ID)

    text = """Prof. Louie F. Cervantes, M. Eng. (Information Engineering)
    CCS 229 - Intelligent Systems
    Department of Computer Science
    College of Information and Communications Technology
    West Visayas State University
    """
    with st.expander("About"):
        st.text(text)

    st.write("Upload an image and select the image analysis task.")

    # File upload for image
    uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
    if uploaded_image is not None:
        # Encode the uploaded image to base64
        base64_image = base64.b64encode(uploaded_image.getvalue()).decode('utf-8')

        # Display the uploaded image
        st.image(uploaded_image, caption="Uploaded Image",  use_container_width=True)

    # List of image analysis tasks
    analysis_tasks = [
        "Scene Analysis: Describe the scene depicted in the image. Identify the objects present, their spatial relationships, and any actions taking place.",
        "Object Detection and Classification: Identify and classify all objects present in the image. Provide detailed descriptions of each object, including its size, shape, color, and texture.",
        "Image Captioning: Generate a concise and accurate caption that describes the content of the image.",
        "Visual Question Answering: Answer specific questions about the image, such as 'What color is the car?' or 'How many people are in the image?'",
        "Image Similarity Search: Given a query image, find similar images from a large dataset based on visual features.",
        "Image Segmentation: Segment the image into different regions corresponding to objects or areas of interest.",
        "Optical Character Recognition (OCR): Extract text from the image, such as printed or handwritten text.",
        "Diagram Understanding: Analyze a diagram (e.g., flowchart, circuit diagram) and extract its structure and meaning.",
        "Art Analysis: Describe the artistic style, subject matter, and emotional impact of an image.",
        "Medical Image Analysis: Analyze medical images (e.g., X-rays, MRIs) to detect abnormalities or diagnose diseases."
    ]

    # Task selection dropdown
    selected_task = st.selectbox("Select an image analysis task:", analysis_tasks)
    
    

    if st.button("Generate Response"):
        st.session_state.stream = st.checkbox("Begin streaming the AI response as soon as it is available.", value=True)    
        stream = st.session_state.stream

        if uploaded_image is None or selected_task == "":
            st.error("Please upload an image and select a task.")
            return

        else:
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Accept": "text/event-stream" if stream else "application/json"
            }

            # Prepare the multimodal prompt
            payload = {
                "model": MODEL_ID,
                "messages": [
                    {
                        "role": "user",
                        "content": f'{selected_task} <img src="data:image/png;base64,{base64_image}" />'
                    }
                ],
                "max_tokens": 512,
                "temperature": 1.00,
                "top_p": 1.00,
                "stream": stream  
            }

            with st.spinner("Processing..."):
                response = requests.post(
                    invoke_url,
                    headers=headers,
                    json=payload,
                    stream=stream  # Important for streaming
                )

                if stream:
                    response_container = st.empty()
                    content = ""
                    # Efficiently handle streaming response
                    for chunk in response.iter_lines(): 
                        if len(chunk) > 0:                             
                            # Decode the bytes object into a string
                            chunk_str = chunk.decode('utf-8')
                            # Remove the "data: " prefix
                            if chunk_str.startswith("data: "):
                                chunk_str = chunk_str[6:]
                            if chunk_str.strip() == "[DONE]":
                                break
                            # Check if the string is not empty
                            if chunk_str.strip() != "":
                                try:
                                    # Attempt to parse the string as JSON
                                    chunk_dict = json.loads(chunk_str)
                                    # Now you can access the 'choices' key
                                    content += chunk_dict['choices'][0]['delta']['content']
                                    response_container.markdown(content)
                                except json.JSONDecodeError as e:
                                    # Handle the error if the string is not valid JSON
                                    print(f"Error parsing JSON: {e}")
                                    print(f"Invalid JSON string: {chunk_str}")

                else:
                    try:
                        content = response.json()
                        content_string = content.get('choices', [{}])[0].get('message', {}).get('content', '')
                        st.write(f"AI Response: {content_string}")

                        st.success("Response generated!")

                    except Exception as e:
                        st.error(f"An error occurred: {e}")
 
if __name__ == "__main__":
    main()