"""Streamlit app that captions an uploaded image and tags its scene.

A BLIP image-to-text pipeline produces a free-form caption, and CLIP picks
the best-matching entry from ``scene_labels`` (zero-shot). Both results are
shown when the user presses the "Generate Caption" button.
"""

import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, pipeline

caption_model_id = "Salesforce/blip-image-captioning-large"
model_id = "openai/clip-vit-large-patch14"

# Candidate scene descriptions that CLIP scores against the uploaded image.
scene_labels = [
    'Arrest',
    'Arson',
    'Explosion',
    'public fight',
    'Normal',
    'Road Accident',
    'Robbery',
    'Shooting',
    'Stealing',
    'Vandalism',
    'Suspicious activity',
    'Tailgating',
    'Unauthorized entry',
    'Protest/Demonstration',
    'Drone suspicious activity',
    'Fire/Smoke detection',
    'Medical emergency',
    'Suspicious package/object',
    'Threatening',
    'Attack',
    'Shoplifting',
    'burglary ',
    'distress',
    'assault',
]


@st.cache_resource
def load_captioner():
    """Return the image-to-text pipeline, built once and reused across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, the model weights would be reloaded each time.
    """
    return pipeline('image-to-text', model=caption_model_id)


@st.cache_resource
def load_clip():
    """Return the ``(processor, model)`` pair for CLIP, cached across reruns."""
    clip_processor = CLIPProcessor.from_pretrained(model_id)
    clip_model = CLIPModel.from_pretrained(model_id)
    return clip_processor, clip_model


# Loaded at start-up (as before), but only once per server process.
captioner = load_captioner()

uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

if uploaded_image is not None:
    # Open the upload once and reuse the same PIL image for both models.
    image = Image.open(uploaded_image)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # The pipeline returns a list of dicts; keep the text of the first result.
    initial_caption = captioner(image)[0]['generated_text']

    processor, model = load_clip()
    inputs = processor(text=scene_labels, images=image, return_tensors="pt", padding=True)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over the label axis turns the image-text scores into
    # probabilities across ``scene_labels``.
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)

    # A single image was passed, so the argmax holds one element; ``.item()``
    # converts it to a plain int usable as a list index.
    context_raw = scene_labels[probs.argmax(dim=-1).item()]
    context = 'the image is depicting scene of ' + context_raw

    # The button lives inside the upload branch because there is nothing to
    # show until an image has been analysed.
    if st.button("Generate Caption"):
        st.write(initial_caption)
        st.write(context)
|