Spaces:

ChandraP12330
/

Surveillance-Image-Caption-2024

Sleeping

App Files Files Community

Surveillance-Image-Caption-2024 / app.py

ChandraP12330

Update app.py

5cb6bf2 verified about 1 year ago

raw

history blame

1.73 kB

	import streamlit as st
	from transformers import pipeline
	from PIL import Image
	#import tensorflow
	import torch

	# Surveillance image captioning demo (Streamlit).
	#
	# Flow: the user uploads an image; on "Generate Caption" the app
	#   1. captions the image with a BLIP image-to-text pipeline, and
	#   2. picks the best-matching scene label with CLIP zero-shot scoring,
	# then shows both results.
	#
	# Streamlit re-executes this script on every widget interaction, so the
	# model loaders are cached and all image-dependent work is guarded by the
	# upload check.
	from transformers import CLIPProcessor, CLIPModel

	caption_model_id = "Salesforce/blip-image-captioning-large"
	model_id = "openai/clip-vit-large-patch14"

	# Candidate scene descriptions scored against the image by CLIP.
	scene_labels = [
	    'Arrest',
	    'Arson',
	    'Explosion',
	    'public fight',
	    'Normal',
	    'Road Accident',
	    'Robbery',
	    'Shooting',
	    'Stealing',
	    'Vandalism',
	    'Suspicious activity',
	    'Tailgating',
	    'Unauthorized entry',
	    'Protest/Demonstration',
	    'Drone suspicious activity',
	    'Fire/Smoke detection',
	    'Medical emergency',
	    'Suspicious package/object',
	    'Threatening',
	    'Attack',
	    'Shoplifting',
	    'burglary ',
	    'distress',
	    'assault',
	]


	# NOTE(review): st.cache_resource needs a reasonably recent Streamlit
	# release -- confirm against the version pinned for this Space.
	@st.cache_resource
	def load_captioner():
	    """Return the BLIP image-to-text pipeline, built once and reused."""
	    return pipeline('image-to-text', model=caption_model_id)


	@st.cache_resource
	def load_clip():
	    """Return the ``(processor, model)`` pair for CLIP, built once and reused."""
	    clip_processor = CLIPProcessor.from_pretrained(model_id)
	    clip_model = CLIPModel.from_pretrained(model_id)
	    return clip_processor, clip_model


	def caption_image(image):
	    """Return the text of the first caption BLIP generates for ``image``."""
	    captions = load_captioner()(image)
	    return captions[0]['generated_text']


	def classify_scene(image):
	    """Return the entry of ``scene_labels`` that CLIP scores highest for ``image``."""
	    processor, model = load_clip()
	    inputs = processor(text=scene_labels, images=image, return_tensors="pt", padding=True)
	    # Inference only: skip gradient tracking.
	    with torch.no_grad():
	        outputs = model(**inputs)
	    logits_per_image = outputs.logits_per_image  # image-text similarity scores
	    probs = logits_per_image.softmax(dim=1)  # softmax over the labels
	    # .item() turns the one-element argmax tensor into a plain int index.
	    best_index = int(probs.argmax(dim=-1).item())
	    return scene_labels[best_index]


	# Display the image using Streamlit
	uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
	if uploaded_image is not None:
	    image = Image.open(uploaded_image)
	    st.image(image, caption="Uploaded Image", use_column_width=True)

	    # Generate the caption
	    if st.button("Generate Caption"):
	        ##BLIP
	        initial_caption = caption_image(image)

	        ##CLIP
	        # strip() keeps stray whitespace in a label out of the sentence.
	        context_raw = classify_scene(image).strip()
	        context = 'the image is depicting scene of ' + context_raw

	        ##LLM
	        # NOTE(review): no LLM step is present in the visible source; until
	        # one is added, show the BLIP caption and the CLIP scene context.
	        st.write(initial_caption)
	        st.write(context)