Pratyush101 committed
Commit f68bda2 · verified · Parent: ce9d171

Update app.py

Files changed (1): app.py (+59, −23)
app.py CHANGED
@@ -8,6 +8,7 @@ import streamlit as st
 from streamlit_webrtc import WebRtcMode, webrtc_streamer
 from sample_utils.turn import get_ice_servers
 import mediapipe as mp
+from cvzone.SelfiSegmentationModule import SelfiSegmentation
 import os
 import time
 
@@ -20,16 +21,23 @@ st.title("Interactive Virtual Keyboard")
 st.subheader('''Turn on the webcam and use hand gestures to interact with the virtual keyboard.
 Use 'a' and 'd' from the keyboard to change the background.''')
 
-# Initialize MediaPipe Hand Detector
+# Initialize MediaPipe and Background Segmentor
 mp_hands = mp.solutions.hands
 hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.7)
 mp_drawing = mp.solutions.drawing_utils
+segmentor = SelfiSegmentation()
 
-# Define virtual keyboard layout
+# Virtual Keyboard Layout
 keys = [["Q", "W", "E", "R", "T", "Y", "U", "I", "O", "P"],
         ["A", "S", "D", "F", "G", "H", "J", "K", "L", ";"],
         ["Z", "X", "C", "V", "B", "N", "M", ",", ".", "/"]]
 
+class Button:
+    def __init__(self, pos, text, size=[100, 100]):
+        self.pos = pos
+        self.size = size
+        self.text = text
+
 class Detection(NamedTuple):
     label: str
     score: float
@@ -37,7 +45,7 @@ class Detection(NamedTuple):
 
 result_queue: "queue.Queue[List[Detection]]" = queue.Queue()
 
-# Load background images
+# Load Background Images
 listImg = os.listdir('model/street') if os.path.exists('model/street') else []
 if not listImg:
     st.error("Error: 'street' directory is missing or empty. Please add background images.")
@@ -48,48 +56,76 @@ else:
 
 indexImg = 0
 output_text = ""
+prev_key_time = [time.time()] * 2
 
 if "output_text" not in st.session_state:
     st.session_state["output_text"] = ""
 
-# Video Frame Callback
+# Video Frame Callback: segmentation, hand tracking, and key presses
 def video_frame_callback(frame: av.VideoFrame) -> av.VideoFrame:
     global indexImg, output_text
 
     img = frame.to_ndarray(format="bgr24")
-    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-
-    # Process the frame with MediaPipe
-    result = hands.process(img_rgb)
+    imgOut = segmentor.removeBG(img, imgList[indexImg])
+
+    # Process frame using MediaPipe
+    result = hands.process(cv2.cvtColor(imgOut, cv2.COLOR_BGR2RGB))
+
+    buttonList = [Button([30 + col * 105, 30 + row * 120], key) for row, line in enumerate(keys) for col, key in enumerate(line)]
 
     detections = []
     if result.multi_hand_landmarks:
         for hand_landmarks in result.multi_hand_landmarks:
+            # Draw hand landmarks
            mp_drawing.draw_landmarks(
-                img, hand_landmarks, mp_hands.HAND_CONNECTIONS,
+                imgOut, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                 mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=4),
                 mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2)
             )
-            # Extract bounding box for detection info
-            x_min, y_min = 1.0, 1.0
-            x_max, y_max = 0.0, 0.0
+
+            # Extract bounding box for each hand
+            h, w, _ = imgOut.shape
+            x_min, y_min = w, h
+            x_max, y_max = 0, 0
             for lm in hand_landmarks.landmark:
-                x_min = min(x_min, lm.x)
-                y_min = min(y_min, lm.y)
-                x_max = max(x_max, lm.x)
-                y_max = max(y_max, lm.y)
-
-            h, w, _ = img.shape
-            bbox = np.array([int(x_min * w), int(y_min * h), int((x_max - x_min) * w), int((y_max - y_min) * h)])
-            detections.append(Detection(label="Hand", score=1.0, box=bbox))
-
-        logger.info(f"Detected {len(detections)} hand(s).")
-    else:
-        logger.info("No hands detected.")
+                x, y = int(lm.x * w), int(lm.y * h)
+                x_min, y_min = min(x_min, x), min(y_min, y)
+                x_max, y_max = max(x_max, x), max(y_max, y)
+
+            bbox = [x_min, y_min, x_max - x_min, y_max - y_min]
+            detections.append(Detection(label="Hand", score=1.0, box=np.array(bbox)))
+
+            # Extract fingertip positions (thumb tip and index fingertip)
+            x4, y4 = int(hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP].x * w), \
+                     int(hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP].y * h)
+            x8, y8 = int(hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].x * w), \
+                     int(hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].y * h)
 
+            # Thumb-to-index pinch distance, used as the "click" gesture
+            distance = np.sqrt((x8 - x4) ** 2 + (y8 - y4) ** 2)
+            click_threshold = 50
+
+            for button in buttonList:
+                x, y = button.pos
+                bw, bh = button.size  # button size (avoids shadowing the image's w, h)
+                if x < x8 < x + bw and y < y8 < y + bh:
+                    cv2.rectangle(imgOut, (x, y), (x + bw, y + bh), (0, 255, 160), -1)
+                    cv2.putText(imgOut, button.text, (x + 20, y + 70), cv2.FONT_HERSHEY_PLAIN, 5, (255, 255, 255), 3)
+
+                    # Simulate a key press if the pinch is close enough
+                    if (distance / np.sqrt(bbox[2] ** 2 + bbox[3] ** 2)) * 100 < click_threshold:
+                        if time.time() - prev_key_time[0] > 2:
+                            prev_key_time[0] = time.time()
+                            if button.text != 'BS' and button.text != 'SPACE':
+                                output_text += button.text
+                            elif button.text == 'BS':
+                                output_text = output_text[:-1]
+                            else:
+                                output_text += ' '
 
     result_queue.put(detections)
     st.session_state["output_text"] = output_text
-    return av.VideoFrame.from_ndarray(img, format="bgr24")
+    return av.VideoFrame.from_ndarray(imgOut, format="bgr24")
 
 # WebRTC Streamer
 webrtc_streamer(
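
The click test added in this commit normalizes the thumb-to-index distance by the hand's bounding-box diagonal, so a pinch registers the same way whether the hand is near or far from the camera. A minimal standalone sketch of that test (the function name and sample coordinates are illustrative, not part of the commit):

import numpy as np

def is_pinch_click(thumb_tip, index_tip, hand_bbox, click_threshold=50):
    # thumb_tip / index_tip: (x, y) pixel coordinates of MediaPipe landmarks 4 and 8.
    # hand_bbox: (x, y, width, height) of the detected hand in pixels.
    (x4, y4), (x8, y8) = thumb_tip, index_tip
    pinch = np.hypot(x8 - x4, y8 - y4)               # pixel distance between the two tips
    diagonal = np.hypot(hand_bbox[2], hand_bbox[3])  # hand size as the scale reference
    return (pinch / diagonal) * 100 < click_threshold

# Hypothetical values: a ~200x220 px hand with tips ~46 px apart registers a click (~15% of the diagonal).
print(is_pinch_click((310, 420), (340, 455), (250, 300, 200, 220)))  # True
print(is_pinch_click((200, 300), (380, 430), (180, 250, 200, 220)))  # False: tips ~222 px apart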
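Each frame is composited over imgList[indexImg] by cvzone's SelfiSegmentation.removeBG before hand tracking runs. The code that builds imgList sits in the elided else: branch, so this is only a plausible sketch of it; the resize target of 640x480 and the variable names are assumptions:

import os
import cv2
from cvzone.SelfiSegmentationModule import SelfiSegmentation

segmentor = SelfiSegmentation()

# Assumed loader for the 'model/street' backgrounds referenced above.
imgList = []
for imgPath in sorted(os.listdir('model/street')):
    img_bg = cv2.imread(os.path.join('model/street', imgPath))
    if img_bg is not None:
        imgList.append(cv2.resize(img_bg, (640, 480)))  # match the assumed frame size

frame = cv2.resize(cv2.imread('sample_frame.jpg'), (640, 480))  # stand-in for a webcam frame
out = segmentor.removeBG(frame, imgList[0])  # person kept, background replaced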
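The diff view is truncated at the webrtc_streamer( call, so the commit's actual arguments are not shown. For orientation only, a typical streamlit_webrtc invocation consistent with the imports above; the key and media constraints here are illustrative guesses, not values from the commit:

webrtc_streamer(
    key="virtual-keyboard",  # illustrative key, not taken from the commit
    mode=WebRtcMode.SENDRECV,
    rtc_configuration={"iceServers": get_ice_servers()},
    video_frame_callback=video_frame_callback,
    media_stream_constraints={"video": True, "audio": False},
    async_processing=True,
)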