# File size: 5,905 Bytes
# 8a4b9ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import asyncio
import functools
import tempfile

import cv2
import edge_tts
import gradio as gr
import numpy as np
import torch
import torchvision.transforms as transforms
from huggingface_hub import InferenceClient
from PIL import Image
from torchvision.models.detection import fasterrcnn_resnet50_fpn

class YoloDetector:
    """Object detector that runs a Darknet YOLO network through ``cv2.dnn``.

    Args:
        weights_path: Path to the network weights file.
        cfg_path: Path to the network configuration file.
        names_path: Text file with one class name per line; the line order
            defines the class index.
        conf_threshold: Minimum class score for a detection to be kept
            (also used as the score threshold for non-maximum suppression).
        nms_threshold: Overlap threshold passed to non-maximum suppression.
    """

    def __init__(self, weights_path, cfg_path, names_path,
                 conf_threshold=0.5, nms_threshold=0.4):
        self.net = cv2.dnn.readNet(weights_path, cfg_path)
        with open(names_path, "r") as f:
            self.classes = [line.strip() for line in f]
        self.conf_threshold = conf_threshold
        self.nms_threshold = nms_threshold
        self.layer_names = self.net.getLayerNames()
        # getUnconnectedOutLayers() returns an Nx1 array on older OpenCV
        # builds and a flat array on newer ones; flattening handles both.
        out_layer_ids = np.array(self.net.getUnconnectedOutLayers()).flatten()
        # The returned ids are 1-based, layer_names is 0-based.
        self.output_layers = [self.layer_names[int(i) - 1] for i in out_layer_ids]

    def detect_objects(self, frame):
        """Detect objects in ``frame`` and draw labelled boxes on it.

        Args:
            frame: Image array whose first two dimensions are height and
                width. It is modified in place.

        Returns:
            The same ``frame`` object, with a green rectangle and class label
            drawn for every detection that survives non-maximum suppression.
        """
        height, width = frame.shape[:2]
        # 0.00392 ~= 1/255 scales pixel values; True swaps the R and B channels.
        blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
        self.net.setInput(blob)
        outs = self.net.forward(self.output_layers)

        class_ids = []
        confidences = []
        boxes = []
        for out in outs:
            # Each row: 4 box values, 1 extra value, then per-class scores.
            scores = out[:, 5:]
            best_ids = scores.argmax(axis=1)
            best_scores = scores[np.arange(scores.shape[0]), best_ids]
            # Only visit rows that pass the threshold instead of every row.
            for row in np.flatnonzero(best_scores > self.conf_threshold):
                # Box values are relative to the frame size: centre x/y, w, h.
                center_x = int(out[row, 0] * width)
                center_y = int(out[row, 1] * height)
                w = int(out[row, 2] * width)
                h = int(out[row, 3] * height)
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append(float(best_scores[row]))
                class_ids.append(int(best_ids[row]))

        indexes = cv2.dnn.NMSBoxes(boxes, confidences, self.conf_threshold, self.nms_threshold)
        # NMSBoxes may return an empty tuple, a flat array or an Nx1 array.
        kept = sorted(int(i) for i in np.array(indexes, dtype=int).flatten())

        font = cv2.FONT_HERSHEY_PLAIN
        color = (0, 255, 0)
        for i in kept:
            x, y, w, h = boxes[i]
            label = str(self.classes[class_ids[i]])
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            cv2.putText(frame, label, (x, y + 30), font, 3, color, 2)

        return frame

class JarvisModels:
    """Bundle of the models used by the assistant.

    Attributes:
        client1: Hugging Face inference client for the Mixtral instruct model.
        detector: ``YoloDetector`` loaded from ``yolov3.weights``,
            ``yolov3.cfg`` and ``coco.names`` in the working directory.
    """

    def __init__(self):
        self.client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
        self.detector = YoloDetector("yolov3.weights", "yolov3.cfg", "coco.names")

    async def generate_model1(self, prompt):
        """Generate a text reply to ``prompt`` and synthesise it to speech.

        Args:
            prompt: User text appended to the system instructions.

        Returns:
            Path of a temporary file holding the synthesised audio. The file
            is created with ``delete=False``; the caller is responsible for
            removing it.
        """
        generate_kwargs = dict(
            temperature=0.6,
            max_new_tokens=256,
            top_p=0.95,
            repetition_penalty=1,
            do_sample=True,
            seed=42,
        )
        # NOTE(review): `system_instructions1` is not defined in the visible
        # part of this module; it must exist at module level or this line
        # raises NameError.
        formatted_prompt = system_instructions1 + prompt + "[JARVIS]"
        stream = self.client1.text_generation(
            formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=True)
        # Skip special tokens (e.g. end-of-sequence markers) so they are not
        # read aloud by the speech synthesiser.
        output = "".join(
            response.token.text for response in stream if not response.token.special
        )

        communicate = edge_tts.Communicate(output)
        # Reserve a path, then close the handle before the synthesiser
        # writes to it.
        # NOTE(review): the ".wav" suffix may not match the audio format
        # edge-tts actually emits - confirm.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name
        # save() is a coroutine; without `await` nothing is ever written.
        await communicate.save(tmp_path)
        return tmp_path

class FasterRCNNDetector:
    """Object detector built on torchvision's Faster R-CNN ResNet-50 FPN.

    Args:
        score_threshold: Predictions scoring below this value are not drawn.
            Pass ``0.0`` to draw every prediction the model returns.
    """

    def __init__(self, score_threshold=0.5):
        self.model = fasterrcnn_resnet50_fpn(pretrained=True)
        self.model.eval()
        self.score_threshold = score_threshold
        # Label index -> class name; "N/A" marks indices with no class.
        self.classes = [
            "__background__", "person", "bicycle", "car", "motorcycle", "airplane", "bus",
            "train", "truck", "boat", "traffic light", "fire hydrant", "N/A", "stop sign",
            "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
            "elephant", "bear", "zebra", "giraffe", "N/A", "backpack", "umbrella", "N/A", "N/A",
            "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
            "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
            "bottle", "N/A", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
            "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza",
            "donut", "cake", "chair", "couch", "potted plant", "bed", "N/A", "dining table",
            "N/A", "N/A", "toilet", "N/A", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "N/A", "book",
            "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
        ]

    def detect_objects(self, image):
        """Run detection on ``image`` and draw the predictions on it.

        Args:
            image: Image array accepted by ``PIL.Image.fromarray``. It is
                modified in place.

        Returns:
            The same ``image`` object with a rectangle and class name drawn
            for each prediction scoring at least ``score_threshold``.
        """
        image_tensor = transforms.ToTensor()(Image.fromarray(image)).unsqueeze(0)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            prediction = self.model(image_tensor)[0]

        detections = zip(prediction['boxes'], prediction['labels'], prediction['scores'])
        for box, label, score in detections:
            # Skip low-confidence predictions, which otherwise clutter the frame.
            if float(score) < self.score_threshold:
                continue
            x1, y1, x2, y2 = (int(v) for v in box)
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image, self.classes[int(label)], (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)

        return image

@functools.lru_cache(maxsize=1)
def _get_jarvis_models():
    """Build the JarvisModels bundle once and reuse it on later calls."""
    return JarvisModels()


def generate_response(frame):
    """Annotate ``frame`` with detections and return a spoken notification.

    The annotated frame is written to ``temp.jpg`` in the working directory,
    and the fixed phrase "Objects detected!" is synthesised to a temporary
    audio file.

    Args:
        frame: Image array passed to the YOLO detector; it is modified in
            place by the detector.

    Returns:
        Path of the temporary audio file. It is created with
        ``delete=False``; the caller is responsible for removing it.

    Raises:
        RuntimeError: If called from a thread that already has a running
            asyncio event loop (a limitation of ``asyncio.run``).
    """
    # Loading the networks is expensive; reuse one cached bundle, not a fresh
    # one per frame.
    jarvis = _get_jarvis_models()
    frame_with_boxes = jarvis.detector.detect_objects(frame)
    cv2.imwrite("temp.jpg", frame_with_boxes)

    communicate = edge_tts.Communicate("Objects detected!")
    # Reserve a path, then close the handle before the synthesiser writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    # save() is a coroutine; it must be driven to completion or no audio
    # is ever written.
    asyncio.run(communicate.save(tmp_path))
    return tmp_path

# Application wiring, executed at import time: build the webcam component and
# launch the Gradio app. FasterRCNNDetector() is instantiated once here, so its
# model is loaded when the module is imported.
# NOTE(review): the keyword arguments used below (`parameters`, `is_streaming`,
# callables passed as `preprocess`/`postprocess`, `layout`, `capture_session`)
# and the use of a component as `fn` do not appear to match Gradio's public
# component / Interface signatures - confirm against the installed Gradio
# version. The app likely needs the form
# gr.Interface(fn=<callable>, inputs=<webcam image>, outputs=<...>).
iface = gr.Webcam(gr.Video(label="Webcam", parameters=["fps=30"], is_streaming=True), preprocess=generate_response, postprocess=FasterRCNNDetector().detect_objects, show_loading=False)
gr.Interface(fn=iface, layout="vertical", capture_session=True).launch()