"""Streamlit app: caption an uploaded image with BLIP, then classify the
scene with CLIP zero-shot classification against a fixed label set."""

import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, pipeline

# Candidate scene descriptions scored by CLIP against the uploaded image.
# NOTE(review): 'burglary ' keeps its original trailing space — the label text
# feeds the CLIP text encoder, so changing it would change model behavior.
SCENE_LABELS = [
    'Arrest', 'Arson', 'Explosion', 'public fight', 'Normal',
    'Road Accident', 'Robbery', 'Shooting', 'Stealing', 'Vandalism',
    'Suspicious activity', 'Tailgating', 'Unauthorized entry',
    'Protest/Demonstration', 'Drone suspicious activity',
    'Fire/Smoke detection', 'Medical emergency',
    'Suspicious package/object', 'Threatening', 'Attack', 'Shoplifting',
    'burglary ', 'distress', 'assault',
]

CLIP_MODEL_ID = "openai/clip-vit-large-patch14"


@st.cache_resource
def load_captioner():
    """Load the BLIP image-to-text pipeline once per server process.

    Streamlit reruns the whole script on every interaction; without caching,
    the model would be reloaded from disk on each rerun.
    """
    return pipeline('image-to-text', model="Salesforce/blip-image-captioning-large")


@st.cache_resource
def load_clip():
    """Load the CLIP processor and model once per server process."""
    processor = CLIPProcessor.from_pretrained(CLIP_MODEL_ID)
    model = CLIPModel.from_pretrained(CLIP_MODEL_ID)
    return processor, model


def classify_scene(image):
    """Return a sentence naming the most likely scene label for *image*.

    Uses CLIP zero-shot classification: the image is scored against every
    entry of SCENE_LABELS and the highest-probability label is reported.
    """
    processor, model = load_clip()
    inputs = processor(text=SCENE_LABELS, images=image,
                       return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only — no gradients needed
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)      # softmax -> label probabilities
    # .item() converts the 1-element argmax tensor to a plain int index.
    context_raw = SCENE_LABELS[probs.argmax(-1).item()]
    return 'the image is depicting scene of ' + context_raw


# -- UI flow -----------------------------------------------------------------
uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

# BUG FIX: the original ran Image.open(uploaded_image) for the CLIP section
# unconditionally, crashing with AttributeError before any file was uploaded.
# Everything that needs the image now sits inside this guard.
if uploaded_image is not None:
    image = Image.open(uploaded_image)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # BUG FIX: the original rebound the pipeline variable to its own string
    # output (initial_caption = initial_caption(image)...), so a second
    # Streamlit rerun would have tried to call a string.
    captioner = load_captioner()
    initial_caption = captioner(image)[0]['generated_text']
    context = classify_scene(image)

    # BUG FIX: the original called st.write() with no arguments, so the
    # generated caption and scene context were never shown to the user.
    if st.button("Generate Caption"):
        st.write(initial_caption)
        st.write(context)