import os
import base64
import requests
import streamlit as st
import json
import tempfile
if "stream" not in st.session_state: | |
st.session_state.stream = True | |
api_key = os.getenv("NVIDIA_VISION_API_KEY") | |
MODEL_ID = "meta/llama-3.2-90b-vision-instruct" | |
invoke_url = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct/chat/completions" | |
# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
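
# Usage sketch (hypothetical path): encode_image("photo.jpg") reads the file in
# binary mode and returns its contents as a Base64 string, suitable for
# embedding in a data URI such as "data:image/png;base64,<encoded>".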
def main():
    st.title(f"Multimodal Image Analysis with {MODEL_ID}")

    # Display about section
    about_text = """Prof. Louie F. Cervantes, M. Eng. (Information Engineering)
    CCS 229 - Intelligent Systems
    Department of Computer Science
    College of Information and Communications Technology
    West Visayas State University
    """
    with st.expander("About"):
        st.text(about_text)

    st.write("Upload an image and select the image analysis task.")

    # File upload for image
    uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
    temp_file_path = None

    if uploaded_image is not None:
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(uploaded_image.getvalue())
            temp_file_path = temp_file.name

        # Encode image as Base64
        with open(temp_file_path, "rb") as f:
            base64_image = base64.b64encode(f.read()).decode()

        # Display the uploaded image
        st.image(uploaded_image, caption="Uploaded Image", use_container_width=True)

    # List of image analysis tasks
    analysis_tasks = [
        "Scene Analysis: Describe the scene depicted in the image. Identify the objects present, their spatial relationships, and any actions taking place.",
        "Object Detection and Classification: Identify and classify all objects present in the image. Provide detailed descriptions of each object, including its size, shape, color, and texture.",
        "Image Captioning: Generate a concise and accurate caption that describes the content of the image.",
        "Visual Question Answering: Answer specific questions about the image, such as 'What color is the car?' or 'How many people are in the image?'",
        "Image Similarity Search: Given a query image, find similar images from a large dataset based on visual features.",
        "Image Segmentation: Segment the image into different regions corresponding to objects or areas of interest.",
        "Optical Character Recognition (OCR): Extract text from the image, such as printed or handwritten text.",
        "Diagram Understanding: Analyze a diagram (e.g., flowchart, circuit diagram) and extract its structure and meaning.",
        "Art Analysis: Describe the artistic style, subject matter, and emotional impact of an image.",
        "Medical Image Analysis: Analyze medical images (e.g., X-rays, MRIs) to detect abnormalities or diagnose diseases."
    ]

    # Task selection dropdown
    selected_task = st.selectbox("Select an image analysis task:", [""] + analysis_tasks)

    # Checkbox for streaming
    stream = st.checkbox("Stream the AI response as it is generated.", value=st.session_state.stream)
    st.session_state.stream = stream  # persist the choice across reruns

    if st.button("Generate Response"):
        if not api_key:
            st.error("API key not found. Please set the NVIDIA_VISION_API_KEY environment variable.")
            return
        if uploaded_image is None:
            st.error("Please upload an image.")
            return
        if not selected_task:
            st.error("Please select an image analysis task.")
            return

        # Headers for the API call
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Accept": "text/event-stream" if stream else "application/json"
        }

        # Prepare the multimodal prompt: the selected task text plus the
        # Base64-encoded image embedded as an inline <img> tag
        payload = {
            "model": MODEL_ID,
            "messages": [
                {
                    "role": "user",
                    "content": f'{selected_task} <img src="data:image/png;base64,{base64_image}" />'
                }
            ],
            "max_tokens": 512,
            "temperature": 1.0,
            "top_p": 1.0,
            "stream": stream
        }

        try:
            with st.spinner("Processing..."):
                response = requests.post(
                    invoke_url,
                    headers=headers,
                    json=payload,
                    stream=stream
                )
                response.raise_for_status()  # Raise exception for HTTP errors

            if stream:
                # Handle streaming response (server-sent events)
                response_container = st.empty()
                content = ""
                # Expected event format: data: {"choices":[{"delta":{"content":"..."}}]}
                # The stream ends with a literal "data: [DONE]" line.
                for chunk in response.iter_lines(decode_unicode=True):
                    if chunk:
                        if "[DONE]" in chunk:
                            # Handle the end chunk
                            st.write("Response generation complete.")
                            break
                        # Check if the chunk is a JSON string
                        elif chunk.startswith("data:"):
                            chunk = chunk[5:].strip()  # Remove the "data:" prefix
                            try:
                                if len(chunk) > 0:
                                    chunk_dict = json.loads(chunk)
                                    if "choices" in chunk_dict and chunk_dict["choices"]:
                                        # Some chunks (e.g., role or finish markers) carry no text
                                        delta_content = chunk_dict["choices"][0].get("delta", {}).get("content", "")
                                        if delta_content:
                                            content += delta_content
                                            response_container.write(content)
                            except json.JSONDecodeError as e:
                                st.error(f"Error parsing JSON: {e}")
            else:
                # Handle non-streaming response
                content = response.json()
                content_string = content.get("choices", [{}])[0].get("message", {}).get("content", "")
                st.write(f"AI Response: {content_string}")
                st.success("Response generated!")
        except requests.exceptions.RequestException as e:
            st.error(f"An error occurred while making the API call: {e}")
        except Exception as e:
            st.error(f"An unexpected error occurred: {e}")
        finally:
            # Clean up temporary file
            if temp_file_path and os.path.exists(temp_file_path):
                os.remove(temp_file_path)

if __name__ == "__main__":
    main()
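
# A minimal way to run this app locally, assuming the script is saved as app.py
# (hypothetical file name) and streamlit/requests are installed:
#
#   export NVIDIA_VISION_API_KEY="<your NVIDIA API key>"
#   streamlit run app.py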