import io
import threading

import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

import spaces

# Initialize the webcam (index 0 = default capture device).
cap = cv2.VideoCapture(0)

# Load the Hugging Face processor and model. The "blip-vqa-base" checkpoint is a
# visual-question-answering model, so it is loaded with BlipForQuestionAnswering
# (not BlipForConditionalGeneration, which is the captioning head).
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)


# Only the model inference needs a GPU allocation; decorating the capture helpers
# or the long-running video_feed generator with @spaces.GPU would hold the
# (time-limited) ZeroGPU allocation without using it.
@spaces.GPU
def query_the_image(query: str, image_data: bytes) -> str:
    """Answer a free-text question about a JPEG-encoded image."""
    try:
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        inputs = processor(image, query, return_tensors="pt").to(model.device)
        output = model.generate(**inputs)
        return processor.decode(output[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error: {e}"


def get_frame():
    """Grab a single frame from the webcam and return it as JPEG bytes."""
    ret, frame = cap.read()
    if not ret:
        return None
    _, buffer = cv2.imencode(".jpg", frame)
    return buffer.tobytes()


def process_image(prompt: str) -> str:
    frame_data = get_frame()
    if frame_data:
        return query_the_image(prompt, frame_data)
    return "Error capturing image"


def video_feed():
    """Generator that streams webcam frames (as RGB arrays) to the UI."""
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # OpenCV delivers BGR; Gradio image components expect RGB.
        yield cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)


with gr.Blocks() as gui:
    gr.Markdown("# Live Video AI Assistant")
    with gr.Row():
        video_component = gr.Image(label="Live Feed")
    prompt = gr.Textbox(label="Enter your safety policy for the AI to analyse each frame in real time")
    response = gr.Textbox(label="AI Response")
    btn = gr.Button("Ask")
    btn.click(process_image, inputs=prompt, outputs=response)
    # Stream frames into the image component when the page loads. The original code
    # started video_feed in a background thread, but Thread(target=video_feed) only
    # creates the generator without iterating it, and nothing routed the frames to
    # the gr.Video component; binding the generator to the load event streams properly.
    gui.load(video_feed, inputs=None, outputs=video_component)

gui.launch()