Spaces:
Sleeping
Sleeping
save changes
Browse files- app.py +103 -0
- requirements.txt +2 -0
app.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import time
|
3 |
+
import base64
|
4 |
+
import requests
|
5 |
+
import streamlit as st
|
6 |
+
|
7 |
+
# Pull the NVIDIA API key from the environment (set as a Space secret).
# When running locally, export NVIDIA_APP_KEY before launching the app.
api_key = os.environ.get("NVIDIA_APP_KEY")
11 |
+
def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string.

    The file is read in binary mode and the base64 bytes are decoded to a
    UTF-8 ``str`` suitable for embedding in a data URI.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
16 |
+
# Whether to request server-sent-events streaming from the completion API.
stream = True

# Accept header must match the requested delivery mode.
if stream:
    _accept_type = "text/event-stream"
else:
    _accept_type = "application/json"

# Common HTTP headers for every NVIDIA API request.
headers = {
    "Authorization": "Bearer {}".format(api_key),
    "Accept": _accept_type,
}
24 |
+
def main():
    """Streamlit entry point: upload an image, pick an analysis task, and
    send a multimodal prompt to NVIDIA's hosted Llama 3.2 90B Vision model.

    Side effects: renders Streamlit widgets and, when the user clicks
    "Generate Response", performs an HTTP POST to the NVIDIA inference
    endpoint using the module-level ``headers`` and ``stream`` settings.
    """
    # Title fixed: the request below targets meta/llama-3.2-90b-vision-instruct,
    # not GPT-4 Turbo as the original title claimed.
    st.title("Multimodal using Llama 3.2 90B Vision Model")

    text = """Prof. Louie F. Cervantes, M. Eng. (Information Engineering)
CCS 229 - Intelligent Systems
Department of Computer Science
College of Information and Communications Technology
West Visayas State University
"""
    with st.expander("About"):
        st.text(text)

    st.write("Upload an image and select the image analysis task.")

    # File upload for image
    uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
    if uploaded_image is not None:
        # Encode the uploaded image to base64 for inline embedding in the prompt.
        base64_image = base64.b64encode(uploaded_image.getvalue()).decode('utf-8')

        # Display the uploaded image
        st.image(uploaded_image, caption="Uploaded Image", use_container_width=True)

        # List of image analysis tasks the model can be asked to perform.
        analysis_tasks = [
            "Scene Analysis: Describe the scene depicted in the image. Identify the objects present, their spatial relationships, and any actions taking place.",
            "Object Detection and Classification: Identify and classify all objects present in the image. Provide detailed descriptions of each object, including its size, shape, color, and texture.",
            "Image Captioning: Generate a concise and accurate caption that describes the content of the image.",
            "Visual Question Answering: Answer specific questions about the image, such as 'What color is the car?' or 'How many people are in the image?'",
            "Image Similarity Search: Given a query image, find similar images from a large dataset based on visual features.",
            "Image Segmentation: Segment the image into different regions corresponding to objects or areas of interest.",
            "Optical Character Recognition (OCR): Extract text from the image, such as printed or handwritten text.",
            "Diagram Understanding: Analyze a diagram (e.g., flowchart, circuit diagram) and extract its structure and meaning.",
            "Art Analysis: Describe the artistic style, subject matter, and emotional impact of an image.",
            "Medical Image Analysis: Analyze medical images (e.g., X-rays, MRIs) to detect abnormalities or diagnose diseases."
        ]

        # Task selection dropdown
        selected_task = st.selectbox("Select an image analysis task:", analysis_tasks)

        # Button to generate response
        if st.button("Generate Response"):
            # Defensive guard only: st.selectbox over a non-empty list always
            # returns a task, and uploaded_image is non-None in this branch
            # (the original also dead-checked `uploaded_image is None` here).
            if not selected_task:
                st.error("Please upload an image and select a task.")  # typo "sekect" fixed
            else:
                # Prepare the multimodal prompt: task text plus the image
                # embedded as an inline base64 data URI.
                payload = {
                    "model": 'meta/llama-3.2-90b-vision-instruct',
                    "messages": [
                        {
                            "role": "user",
                            "content": f'{selected_task} <img src="data:image/png;base64,{base64_image}" />'
                        }
                    ],
                    "max_tokens": 512,
                    "temperature": 1.00,
                    "top_p": 1.00,
                    "stream": stream
                }

                with st.spinner("Processing..."):
                    try:
                        # stream=stream: without it, requests buffers the whole
                        # body before iter_lines(), defeating SSE streaming.
                        # timeout: avoid hanging the app forever on a dead link.
                        response = requests.post(
                            "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct/chat/completions",
                            headers=headers,
                            json=payload,
                            stream=stream,
                            timeout=120,
                        )
                        # Surface 4xx/5xx (e.g. bad API key) as a clear error
                        # instead of a confusing KeyError on the JSON below.
                        response.raise_for_status()
                        # Display the response if streaming
                        if stream:
                            for line in response.iter_lines():
                                if line:
                                    st.write(line.decode("utf-8"))
                        else:
                            # Show the response content
                            content = response.json()
                            contentstring = content['choices'][0]['message']['content']
                            st.write(f"AI Response: {contentstring}")
                            st.success("Response generated!")
                    except Exception as e:
                        # Broad catch is deliberate: any failure is reported
                        # in the UI rather than crashing the Streamlit script.
                        st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
streamlit
requests
openai
|