"""Gradio demo: upload a video and get an Indian Sign Language prediction.

The prediction returned by ``predict`` is still a hard-coded placeholder; the
model is run on the decoded frames, but its outputs are not decoded yet.
"""

import gradio as gr
from transformers import AutoModel, AutoProcessor
import torch
import cv2

# Load the model and processor from Hugging Face Hub
model_name = "OpenGVLab/InternVideo2_5_Chat_8B"  # Replace with the correct model name
# NOTE(review): trust_remote_code=True executes Python code shipped in the
# model repository; only keep it for repositories you trust.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)


def _read_frames(video_path):
    """Decode every frame of the video at ``video_path`` as RGB arrays.

    Args:
        video_path: Filesystem path of the video to decode.

    Returns:
        A list with one RGB image array per decoded frame (possibly empty).

    Raises:
        gr.Error: If OpenCV cannot open the file.
    """
    capture = cv2.VideoCapture(video_path)
    try:
        if not capture.isOpened():
            raise gr.Error("Could not open the uploaded video.")

        # NOTE(review): every frame is kept in memory; long or high-resolution
        # videos may need frame sampling before they reach the model.
        frames = []
        while True:
            ret, frame = capture.read()
            if not ret:
                break
            # OpenCV decodes to BGR; convert to the RGB order that image and
            # video processors conventionally expect.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        return frames
    finally:
        # Release the decoder handle even if reading or conversion fails.
        capture.release()


def predict(video_path):
    """Run the model on an uploaded video and return the predicted text.

    Args:
        video_path: Path of the uploaded video as provided by ``gr.Video``;
            empty or ``None`` when nothing was uploaded.

    Returns:
        The prediction string (currently a fixed example value).

    Raises:
        gr.Error: If no video was provided, it cannot be opened, or it
            contains no readable frames.
    """
    if not video_path:
        raise gr.Error("Please upload a video first.")

    frames = _read_frames(video_path)
    if not frames:
        raise gr.Error("No frames could be read from the uploaded video.")

    # Preprocess the frames
    # NOTE(review): confirm against the model card that this checkpoint's
    # remote code exposes a processor/forward API compatible with these calls.
    inputs = processor(frames, return_tensors="pt")

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)  # noqa: F841 - not decoded yet, see TODO

    # TODO: derive the prediction from `outputs`; this is a placeholder.
    prediction = "Hello (Example Prediction)"

    return prediction


# Create Gradio interface
iface = gr.Interface(
    fn=predict,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Textbox(label="Prediction"),
    title="Indian Sign Language Recognition",
    description="Upload a video to recognize Indian Sign Language gestures.",
)

# Launch the interface only when executed as a script, so importing this
# module (e.g. from tests or tooling) does not start a blocking web server.
if __name__ == "__main__":
    iface.launch()