Spaces: Running on Zero
import io
import time
import cv2
import torch
import gradio as gr
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import spaces
# Initialize the webcam
cap = cv2.VideoCapture(0)
# Load the Hugging Face processor and the BLIP VQA model
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
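# A quick sanity check of the VQA pipeline (a sketch, kept commented so it does
# not run at startup; "test.jpg" is a hypothetical local image):
#   image = Image.open("test.jpg").convert("RGB")
#   inputs = processor(image, "what is in the picture?", return_tensors="pt").to(model.device)
#   output = model.generate(**inputs)
#   print(processor.decode(output[0], skip_special_tokens=True))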
@spaces.GPU  # ZeroGPU: a GPU is attached only for the duration of this call
def query_the_image(query: str, image_data: bytes):
    try:
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        inputs = processor(image, query, return_tensors="pt").to(model.device)
        output = model.generate(**inputs)
        answer = processor.decode(output[0], skip_special_tokens=True)
        return answer
    except Exception as e:
        return f"Error: {e}"
def get_frame():
    # Grab a single frame from the webcam and return it as JPEG bytes
    ret, frame = cap.read()
    if not ret:
        return None
    _, buffer = cv2.imencode('.jpg', frame)
    return buffer.tobytes()
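# The JPEG round-trip keeps the interface bytes-based, but a frame could also
# be handed to PIL directly (alternative sketch; note OpenCV frames are BGR):
#   rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#   image = Image.fromarray(rgb)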
def process_image(prompt):
    frame_data = get_frame()
    if frame_data:
        return query_the_image(prompt, frame_data)
    return "Error capturing image"
def video_feed():
    # Generator: Gradio streams each yielded frame to the bound output component
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # OpenCV captures BGR; convert to RGB for display
        yield cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        time.sleep(1 / 30)  # throttle to roughly 30 fps
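# Gradio treats generator callbacks as streams: each yielded value replaces the
# bound component's contents, which is what renders video_feed as a live feed
# below (assumes Gradio 4.x streaming behaviour).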
gui = gr.Blocks()
with gui:
    gr.Markdown("# Live Video AI Assistant")
    with gr.Row():
        video_component = gr.Image(label="Live Feed")
    prompt = gr.Textbox(label="Enter your safety policy for the AI to analyse each frame in real time")
    response = gr.Textbox(label="AI Response")
    btn = gr.Button("Ask")
    btn.click(process_image, inputs=prompt, outputs=response)
    # Start streaming webcam frames into the image component when the page loads
    gui.load(video_feed, inputs=None, outputs=video_component)
gui.launch()
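# A matching requirements.txt for the Space might look like this (a sketch;
# versions unpinned, adjust as needed):
#   gradio
#   transformers
#   torch
#   opencv-python
#   Pillow
#   spaces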