# File size: 5,905 Bytes
# 8a4b9ae
import asyncio
import functools
import tempfile

import cv2
import edge_tts
import gradio as gr
import numpy as np
import torch
import torchvision.transforms as transforms
from huggingface_hub import InferenceClient
from PIL import Image
from torchvision.models.detection import fasterrcnn_resnet50_fpn
class YoloDetector:
    """Object detector that runs a network through OpenCV's ``cv2.dnn`` module.

    The network and the class-name list are loaded once in ``__init__``;
    ``detect_objects`` then draws the detections that survive non-maximum
    suppression onto the frame it is given.
    """

    def __init__(self, weights_path, cfg_path, names_path):
        """Load the network and its class names.

        Args:
            weights_path: Path passed to ``cv2.dnn.readNet`` as the model file.
            cfg_path: Path passed to ``cv2.dnn.readNet`` as the config file.
            names_path: Text file with one class name per line; the line
                number (0-based) is the class id.
        """
        self.net = cv2.dnn.readNet(weights_path, cfg_path)
        with open(names_path, "r") as f:
            self.classes = [line.strip() for line in f]
        self.layer_names = self.net.getLayerNames()
        # Depending on the OpenCV build, getUnconnectedOutLayers() yields
        # either nested one-element rows or a flat sequence of 1-based layer
        # ids. Flattening accepts both; the previous `i[0]` only worked for
        # the nested form.
        out_ids = np.asarray(self.net.getUnconnectedOutLayers()).flatten()
        self.output_layers = [self.layer_names[int(i) - 1] for i in out_ids]

    @staticmethod
    def _decode_outputs(outs, width, height, conf_threshold=0.5):
        """Turn raw network outputs into pixel boxes, confidences and class ids.

        Each detection row is read as ``[cx, cy, w, h, _, score_0, score_1, ...]``
        where the first four values are fractions of the frame size and the
        value at index 4 is not used.

        Args:
            outs: Iterable of 2-D arrays, one per output layer.
            width: Frame width in pixels.
            height: Frame height in pixels.
            conf_threshold: A detection is kept only when its best class score
                is strictly greater than this value.

        Returns:
            Tuple ``(boxes, confidences, class_ids)`` of equal-length lists;
            each box is ``[x, y, w, h]`` with ``(x, y)`` the top-left corner.
        """
        boxes, confidences, class_ids = [], [], []
        for out in outs:
            for detection in out:
                scores = detection[5:]
                class_id = int(np.argmax(scores))
                confidence = float(scores[class_id])
                if confidence <= conf_threshold:
                    continue
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                # Convert centre/size to the top-left corner form used below.
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append(confidence)
                class_ids.append(class_id)
        return boxes, confidences, class_ids

    def detect_objects(self, frame):
        """Detect objects in ``frame`` and draw labelled boxes on it.

        Args:
            frame: Image array whose first two dimensions are height and
                width. NOTE(review): channel order expected by the network is
                not visible here - confirm against the caller.

        Returns:
            The same ``frame`` object, annotated in place.
        """
        height, width = frame.shape[:2]
        # 0.00392 is approximately 1/255, i.e. pixel values are scaled to 0..1.
        blob = cv2.dnn.blobFromImage(
            frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False
        )
        self.net.setInput(blob)
        outs = self.net.forward(self.output_layers)

        boxes, confidences, class_ids = self._decode_outputs(outs, width, height)
        if not boxes:
            return frame

        keep = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
        font = cv2.FONT_HERSHEY_PLAIN
        color = (0, 255, 0)
        # Iterate the surviving indices directly instead of testing every box
        # index for membership; flatten() copes with nested or flat results.
        for i in np.asarray(keep).flatten():
            i = int(i)
            x, y, w, h = boxes[i]
            label = str(self.classes[class_ids[i]])
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            cv2.putText(frame, label, (x, y + 30), font, 3, color, 2)
        return frame
class JarvisModels:
    """Holds the hosted text-generation client and the YOLO detector."""

    def __init__(self):
        """Create the inference client and load the detector from local files."""
        self.client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
        self.detector = YoloDetector("yolov3.weights", "yolov3.cfg", "coco.names")

    async def generate_model1(self, prompt):
        """Generate a reply to ``prompt`` and synthesize it to an audio file.

        Args:
            prompt: User text appended to the system instructions.

        Returns:
            Path of a temporary file holding the synthesized speech. The file
            is created with ``delete=False``, so the caller is responsible for
            removing it.
        """
        generate_kwargs = dict(
            temperature=0.6,
            max_new_tokens=256,
            top_p=0.95,
            repetition_penalty=1,
            do_sample=True,
            seed=42,
        )
        # NOTE(review): `system_instructions1` is not defined in the part of
        # the file visible to this review; it must exist at module level or
        # this line raises NameError.
        formatted_prompt = system_instructions1 + prompt + "[JARVIS]"
        stream = self.client1.text_generation(
            formatted_prompt,
            **generate_kwargs,
            stream=True,
            details=True,
            return_full_text=True,
        )
        output = "".join(response.token.text for response in stream)

        communicate = edge_tts.Communicate(output)
        # Only reserve a unique path here; the handle is closed before the
        # synthesizer writes to that path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name
        # Communicate.save is a coroutine: without `await` it never runs and
        # the returned file stays empty.
        # NOTE(review): edge-tts is understood to emit MP3 data; the ".wav"
        # suffix is kept unchanged for compatibility - confirm.
        await communicate.save(tmp_path)
        return tmp_path
class FasterRCNNDetector:
    """Object detector built on torchvision's ``fasterrcnn_resnet50_fpn``."""

    def __init__(self):
        """Load the pretrained model, switch it to eval mode, set class names."""
        # NOTE(review): newer torchvision releases deprecate `pretrained=` in
        # favour of `weights=`; kept as-is so older installs keep working.
        self.model = fasterrcnn_resnet50_fpn(pretrained=True)
        self.model.eval()
        # Label id -> display name; "N/A" marks ids with no name in this table.
        self.classes = [
            "__background__", "person", "bicycle", "car", "motorcycle", "airplane", "bus",
            "train", "truck", "boat", "traffic light", "fire hydrant", "N/A", "stop sign",
            "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
            "elephant", "bear", "zebra", "giraffe", "N/A", "backpack", "umbrella", "N/A", "N/A",
            "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
            "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
            "bottle", "N/A", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
            "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza",
            "donut", "cake", "chair", "couch", "potted plant", "bed", "N/A", "dining table",
            "N/A", "N/A", "toilet", "N/A", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "N/A", "book",
            "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
        ]

    @staticmethod
    def _confident_detections(prediction, score_threshold):
        """Select detections whose score reaches ``score_threshold``.

        Args:
            prediction: Mapping with parallel ``"boxes"``, ``"labels"`` and
                ``"scores"`` sequences.
            score_threshold: Minimum score (inclusive) for a detection to be kept.

        Returns:
            List of ``(box, label)`` pairs where ``box`` is a list of four
            ints and ``label`` is an int.
        """
        kept = []
        for box, label, score in zip(
            prediction["boxes"], prediction["labels"], prediction["scores"]
        ):
            if float(score) < score_threshold:
                continue
            kept.append(([int(v) for v in box], int(label)))
        return kept

    def detect_objects(self, image, score_threshold=0.5):
        """Run the model on ``image`` and draw labelled boxes on it.

        Args:
            image: Array accepted by ``PIL.Image.fromarray``.
                NOTE(review): channel order is not converted here - confirm
                the caller supplies what the model expects.
            score_threshold: Detections scoring below this value are not
                drawn. Pass ``0.0`` to draw every box the model returns,
                which is what happened before this parameter existed.

        Returns:
            The same ``image`` object, annotated in place.
        """
        image_pil = Image.fromarray(image)
        image_tensor = transforms.ToTensor()(image_pil).unsqueeze(0)
        with torch.no_grad():
            prediction = self.model(image_tensor)

        for box, label in self._confident_detections(prediction[0], score_threshold):
            cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
            cv2.putText(
                image,
                self.classes[label],
                (box[0], box[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.9,
                (36, 255, 12),
                2,
            )
        return image
@functools.lru_cache(maxsize=1)
def _get_jarvis():
    """Build the JarvisModels bundle once and reuse it on later calls."""
    return JarvisModels()


def generate_response(frame):
    """Annotate ``frame`` with YOLO detections and synthesize a spoken notice.

    The annotated frame is written to ``temp.jpg`` in the working directory,
    and the fixed phrase "Objects detected!" is synthesized to a temporary
    audio file.

    Args:
        frame: Image array handed to ``YoloDetector.detect_objects``.

    Returns:
        Path of the temporary audio file. It is created with ``delete=False``,
        so the caller is responsible for removing it.
    """
    # Constructing JarvisModels loads network weights from disk, so it is
    # cached and not rebuilt on every frame. The FasterRCNNDetector that used
    # to be built here was never used and is no longer created.
    frame_with_boxes = _get_jarvis().detector.detect_objects(frame)
    cv2.imwrite("temp.jpg", frame_with_boxes)

    communicate = edge_tts.Communicate("Objects detected!")
    # Only reserve a unique path here; the handle is closed before the
    # synthesizer writes to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    # Communicate.save is a coroutine; it has to be driven to completion or
    # nothing is written. asyncio.run requires that no event loop is already
    # running in the calling thread.
    asyncio.run(communicate.save(tmp_path))
    return tmp_path
# Web UI: one image frame in, the synthesized audio file out.
# gr.Interface needs a callable plus explicit input and output components;
# the earlier call passed a component as `fn` and no inputs or outputs.
# NOTE(review): the keyword that restricts the image source to the webcam
# differs between Gradio major versions - set it for the installed version.
# NOTE(review): FasterRCNNDetector is not wired in here, because
# generate_response returns an audio path and not an image - confirm intent.
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Image(label="Webcam"),
    outputs=gr.Audio(label="Response"),
)

if __name__ == "__main__":
    iface.launch()