import io
import threading

import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

import spaces

# Initialize the webcam (index 0 = default capture device).
cap = cv2.VideoCapture(0)

# Load the Hugging Face processor and model. The "blip-vqa-base" checkpoint is a
# visual-question-answering model, so it is loaded with BlipForQuestionAnswering
# (not BlipForConditionalGeneration, which is the captioning head).
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)


# Only the model inference needs a GPU allocation; decorating the capture helpers
# or the long-running video_feed generator with @spaces.GPU would hold the
# (time-limited) ZeroGPU allocation without using it.
@spaces.GPU
def query_the_image(query: str, image_data: bytes) -> str:
    """Answer a free-text question about a JPEG-encoded image."""
    try:
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        inputs = processor(image, query, return_tensors="pt").to(model.device)
        output = model.generate(**inputs)
        return processor.decode(output[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error: {e}"


def get_frame():
    """Grab a single frame from the webcam and return it as JPEG bytes."""
    ret, frame = cap.read()
    if not ret:
        return None
    _, buffer = cv2.imencode(".jpg", frame)
    return buffer.tobytes()


def process_image(prompt: str) -> str:
    frame_data = get_frame()
    if frame_data:
        return query_the_image(prompt, frame_data)
    return "Error capturing image"


def video_feed():
    """Generator that streams webcam frames (as RGB arrays) to the UI."""
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # OpenCV delivers BGR; Gradio image components expect RGB.
        yield cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)


with gr.Blocks() as gui:
    gr.Markdown("# Live Video AI Assistant")
    with gr.Row():
        video_component = gr.Image(label="Live Feed")
    prompt = gr.Textbox(label="Enter your safety policy for the AI to analyse each frame in real time")
    response = gr.Textbox(label="AI Response")
    btn = gr.Button("Ask")
    btn.click(process_image, inputs=prompt, outputs=response)
    # Stream frames into the image component when the page loads. The original code
    # started video_feed in a background thread, but Thread(target=video_feed) only
    # creates the generator without iterating it, and nothing routed the frames to
    # the gr.Video component; binding the generator to the load event streams properly.
    gui.load(video_feed, inputs=None, outputs=video_component)

gui.launch()