"""Streamlit app: caption an uploaded image with BLIP, then classify the
scene with CLIP zero-shot classification against a fixed label set."""

import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, pipeline

# Candidate scene descriptions scored by CLIP against the uploaded image.
# NOTE(review): 'burglary ' keeps its original trailing space — the label text
# feeds the CLIP text encoder, so changing it would change model behavior.
SCENE_LABELS = [
    'Arrest', 'Arson', 'Explosion', 'public fight', 'Normal',
    'Road Accident', 'Robbery', 'Shooting', 'Stealing', 'Vandalism',
    'Suspicious activity', 'Tailgating', 'Unauthorized entry',
    'Protest/Demonstration', 'Drone suspicious activity',
    'Fire/Smoke detection', 'Medical emergency',
    'Suspicious package/object', 'Threatening', 'Attack', 'Shoplifting',
    'burglary ', 'distress', 'assault',
]

CLIP_MODEL_ID = "openai/clip-vit-large-patch14"


@st.cache_resource
def load_captioner():
    """Load the BLIP image-to-text pipeline once per server process.

    Streamlit reruns the whole script on every interaction; without caching,
    the model would be reloaded from disk on each rerun.
    """
    return pipeline('image-to-text', model="Salesforce/blip-image-captioning-large")


@st.cache_resource
def load_clip():
    """Load the CLIP processor and model once per server process."""
    processor = CLIPProcessor.from_pretrained(CLIP_MODEL_ID)
    model = CLIPModel.from_pretrained(CLIP_MODEL_ID)
    return processor, model


def classify_scene(image):
    """Return a sentence naming the most likely scene label for *image*.

    Uses CLIP zero-shot classification: the image is scored against every
    entry of SCENE_LABELS and the highest-probability label is reported.
    """
    processor, model = load_clip()
    inputs = processor(text=SCENE_LABELS, images=image,
                       return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only — no gradients needed
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)      # softmax -> label probabilities
    # .item() converts the 1-element argmax tensor to a plain int index.
    context_raw = SCENE_LABELS[probs.argmax(-1).item()]
    return 'the image is depicting scene of ' + context_raw


# -- UI flow -----------------------------------------------------------------
uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

# BUG FIX: the original ran Image.open(uploaded_image) for the CLIP section
# unconditionally, crashing with AttributeError before any file was uploaded.
# Everything that needs the image now sits inside this guard.
if uploaded_image is not None:
    image = Image.open(uploaded_image)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # BUG FIX: the original rebound the pipeline variable to its own string
    # output (initial_caption = initial_caption(image)...), so a second
    # Streamlit rerun would have tried to call a string.
    captioner = load_captioner()
    initial_caption = captioner(image)[0]['generated_text']
    context = classify_scene(image)

    # BUG FIX: the original called st.write() with no arguments, so the
    # generated caption and scene context were never shown to the user.
    if st.button("Generate Caption"):
        st.write(initial_caption)
        st.write(context)