# Source: Hugging Face Space file "app.py" by ChandraP12330
# (commit 5cb6bf2, verified; file size 1.73 kB — provenance from the HF file viewer).
"""Streamlit app: caption an uploaded image with BLIP, then zero-shot
classify the depicted scene with CLIP against a fixed label set."""
import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, pipeline

# Candidate scene categories for CLIP zero-shot classification.
# NOTE(review): 'burglary ' keeps its trailing space on purpose — it is a
# runtime label string and changing it would alter the CLIP text inputs.
SCENE_LABELS = ['Arrest',
                'Arson',
                'Explosion',
                'public fight',
                'Normal',
                'Road Accident',
                'Robbery',
                'Shooting',
                'Stealing',
                'Vandalism',
                'Suspicious activity',
                'Tailgating',
                'Unauthorized entry',
                'Protest/Demonstration',
                'Drone suspicious activity',
                'Fire/Smoke detection',
                'Medical emergency',
                'Suspicious package/object',
                'Threatening',
                'Attack',
                'Shoplifting',
                'burglary ',
                'distress',
                'assault']


@st.cache_resource
def load_caption_pipeline():
    """Load the BLIP image-to-text pipeline once per server process.

    Cached so Streamlit reruns (every widget interaction) don't re-download
    and re-instantiate the model.
    """
    return pipeline('image-to-text', model="Salesforce/blip-image-captioning-large")


@st.cache_resource
def load_clip():
    """Load the CLIP processor and model once per server process."""
    model_id = "openai/clip-vit-large-patch14"
    return CLIPProcessor.from_pretrained(model_id), CLIPModel.from_pretrained(model_id)


uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

# Guard everything on an actual upload: the original script ran the CLIP
# section unconditionally and crashed on Image.open(None) before any upload.
if uploaded_image is not None:
    image = Image.open(uploaded_image)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    if st.button("Generate Caption"):
        ## BLIP: generate a free-form caption.
        caption_pipeline = load_caption_pipeline()
        initial_caption = caption_pipeline(image)[0]['generated_text']

        ## CLIP: zero-shot scene classification over SCENE_LABELS.
        processor, model = load_clip()
        inputs = processor(text=SCENE_LABELS, images=image,
                           return_tensors="pt", padding=True)
        with torch.no_grad():  # inference only — no gradients needed
            outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # image-text similarity scores
        probs = logits_per_image.softmax(dim=1)      # softmax -> label probabilities
        # .item() converts the argmax tensor to a plain int for list indexing.
        context_raw = SCENE_LABELS[probs.argmax(-1).item()]
        context = 'the image is depicting scene of ' + context_raw

        # The original called st.write() with no arguments, displaying nothing.
        st.write(initial_caption)
        st.write(context)