import os

import streamlit as st
import torch
from PIL import Image
from transformers import pipeline, CLIPProcessor, CLIPModel
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory

## BLIP: generate an initial caption for the uploaded image
# Create the image-to-text pipeline (downloads the model weights on first run)
initial_caption_pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
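# Optional sketch: Streamlit reruns this script top to bottom on every
# interaction, so the model would be reloaded each time. Wrapping the loader
# in st.cache_resource (available in recent Streamlit releases) avoids that;
# the same pattern applies to the CLIP model below. Illustrative only:
#
#   @st.cache_resource
#   def load_caption_pipe():
#       return pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
#
#   initial_caption_pipe = load_caption_pipe()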

# Upload and display the image using Streamlit
uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
if uploaded_image is None:
    st.stop()  # halt the script until an image is uploaded

image = Image.open(uploaded_image)
st.image(image, caption="Uploaded Image", use_column_width=True)

# Generate the initial caption
initial_caption = initial_caption_pipe(image)[0]["generated_text"]
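# For reference, the image-to-text pipeline returns a list with one dict per
# generated caption, e.g. [{"generated_text": "..."}], which is why the result
# is unpacked with [0]["generated_text"].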

## CLIP: zero-shot scene classification to provide extra context
model_id = "openai/clip-vit-large-patch14"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)
scene_labels = [
    "Arrest",
    "Arson",
    "Explosion",
    "public fight",
    "Normal",
    "Road Accident",
    "Robbery",
    "Shooting",
    "Stealing",
    "Vandalism",
    "Suspicious activity",
    "Tailgating",
    "Unauthorized entry",
    "Protest/Demonstration",
    "Drone suspicious activity",
    "Fire/Smoke detection",
    "Medical emergency",
    "Suspicious package/object",
    "Threatening",
    "Attack",
    "Shoplifting",
    "burglary",
    "distress",
    "assault",
]
inputs = processor(text=scene_labels, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # image-text similarity scores, shape (1, num_labels)
probs = logits_per_image.softmax(dim=1)  # softmax over the labels gives probabilities
context_raw = scene_labels[probs.argmax(-1).item()]
context = "the image is depicting a scene of " + context_raw
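# Optional sketch: torch.topk can surface the top few labels instead of only
# the argmax, which helps when the scene is ambiguous (variable names here
# are illustrative):
#
#   top_probs, top_idx = torch.topk(probs, k=3, dim=1)
#   for p, i in zip(top_probs[0].tolist(), top_idx[0].tolist()):
#       st.write(f"{scene_labels[i]}: {p:.2%}")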

## LLM: refine the caption with Gemini
GOOGLE_API_KEY = st.text_input("Please enter your GOOGLE GEMINI API KEY", type="password")
if not GOOGLE_API_KEY:
    st.stop()  # the Gemini client cannot be constructed without a key
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
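# Optional sketch: in a deployed app the key would normally come from
# Streamlit's secrets store rather than a text box, e.g.
#   GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
# (assumes a .streamlit/secrets.toml file defining that entry).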

llm = ChatGoogleGenerativeAI(
    model="gemini-1.0-pro-latest",
    google_api_key=GOOGLE_API_KEY,
    temperature=0.2,
    top_p=1,
    top_k=1,
    # Relax the default safety filters: surveillance scenes (fights, weapons,
    # accidents) would otherwise often be blocked as harmful content.
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    },
)
template = """You are an advanced image captioning AI assistant for surveillance-related images.
Your task is to refine and improve an initial image caption using the contextual information provided.
You will receive two inputs:
Input 1: {initial_caption} - the initial caption for the image, most likely a grammatically
incorrect and incomplete sentence, generated by a separate, weaker image captioning model.
Input 2: {context} - contextual information that provides more detail about the background.
Your goal is to take the initial caption and the additional context and produce a new, refined
caption that incorporates the contextual details.
Please do not speculate about details that are not provided. The final caption must be
grammatically correct. Please output only the final caption."""

prompt_template = PromptTemplate(
    template=template,
    input_variables=["initial_caption", "context"],
)

# Generate the refined caption on demand; calling the LLM inside the button
# handler avoids hitting the Gemini API on every Streamlit rerun
if st.button("Generate Caption"):
    prompt = prompt_template.format(initial_caption=initial_caption, context=context)
    response = llm.invoke(prompt)
    final_caption = response.content
    st.write(final_caption)
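# Usage: assuming this file is saved as app.py, launch the app with
#   streamlit run app.py
# The heavy models (BLIP large, CLIP ViT-L/14) are downloaded on first run and
# benefit from a GPU, though CPU inference also works for single images.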