Pratyush101 committed on
Commit fca4a0b · verified · 1 Parent(s): 7b77011

Update app.py


I have modified app.py and added the AI keyboard code to it (part 1); the previous version is still working.
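The part-1 keyboard logic added in this commit registers a key press with a pinch gesture: the new video_frame_callback measures the distance between the thumb tip (landmark 4) and the index fingertip (landmark 8), scales it by the hand's bounding-box diagonal, and treats anything under 10% as a click, with a 2-second per-hand debounce. A minimal standalone sketch of that check (the is_click helper and the landmark values are hypothetical, for illustration only):

    import numpy as np

    def is_click(lmList, bbox, click_threshold=10):
        # Pinch test used by the new callback: thumb tip (index 4) vs. index fingertip
        # (index 8), scaled by the hand bounding-box diagonal (bbox = x, y, w, h).
        x4, y4 = lmList[4][0], lmList[4][1]
        x8, y8 = lmList[8][0], lmList[8][1]
        distance = np.sqrt((x8 - x4) ** 2 + (y8 - y4) ** 2)
        diagonal = np.sqrt(bbox[2] ** 2 + bbox[3] ** 2)
        return (distance / diagonal) * 100 < click_threshold

    # Made-up landmarks: only indices 4 and 8 matter for this check.
    lmList = [[0, 0]] * 21
    lmList[4] = [210, 310]
    lmList[8] = [200, 300]
    print(is_click(lmList, bbox=(180, 280, 220, 240)))  # True: pinch is about 4% of the diagonal

In the committed code the same ratio is only evaluated while the index fingertip is hovering inside a key, and prev_key_time[i] blocks repeat presses for 2 seconds.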

Files changed (1)
  1. app.py +263 -154
app.py CHANGED
@@ -1,188 +1,297 @@
- import logging
- import queue
- from pathlib import Path
- from typing import List, NamedTuple
- import mediapipe as mp
- import av
- import cv2
- import numpy as np
- import streamlit as st
- from streamlit_webrtc import WebRtcMode, webrtc_streamer
- from sample_utils.turn import get_ice_servers
- from cvzone.HandTrackingModule import HandDetector
- from cvzone.SelfiSegmentationModule import SelfiSegmentation
- import time
- import os
- logger = logging.getLogger(__name__)
- st.title("Interactive Virtual Keyboard with Twilio Integration")
- st.info("Use your webcam to interact with the virtual keyboard via hand gestures.")
- class Button:
-     def __init__(self, pos, text, size=[100, 100]):
-         self.pos = pos
-         self.size = size
-         self.text = text
- # Initialize components
- detector = HandDetector(maxHands=1, detectionCon=0.8)
- # segmentor = SelfiSegmentation()
- # keys = [["Q", "W", "E", "R", "T", "Y", "U", "I", "O", "P"],
- #         ["A", "S", "D", "F", "G", "H", "J", "K", "L", ";"],
- #         ["Z", "X", "C", "V", "B", "N", "M", ",", ".", "/"]]
- # listImg = os.listdir('model/street')
- # imgList = [cv2.imread(f'model/street/{imgPath}') for imgPath in listImg]
- # indexImg = 0
- # # Function to process the video frame from the webcam
- # def process_video_frame(frame, detector, segmentor, imgList, indexImg, keys, session_state):
- #     # Convert the frame to a numpy array (BGR format)
  #     image = frame.to_ndarray(format="bgr24")
-
- #     # Remove background using SelfiSegmentation
- #     imgOut = segmentor.removeBG(image, imgList[indexImg])
- #     # Detect hands on the background-removed image
- #     hands, img = detector.findHands(imgOut, flipType=False)
-
- #     # Create a blank canvas for the keyboard
- #     keyboard_canvas = np.zeros_like(img)
- #     buttonList = []
-
- #     # Create buttons for the virtual keyboard based on the keys list
- #     for key in keys[0]:
- #         buttonList.append(Button([30 + keys[0].index(key) * 105, 30], key))
- #     for key in keys[1]:
- #         buttonList.append(Button([30 + keys[1].index(key) * 105, 150], key))
- #     for key in keys[2]:
- #         buttonList.append(Button([30 + keys[2].index(key) * 105, 260], key))
-
- #     # Draw the buttons on the keyboard canvas
- #     for button in buttonList:
- #         x, y = button.pos
- #         cv2.rectangle(keyboard_canvas, (x, y), (x + button.size[0], y + button.size[1]), (255, 255, 255), -1)
- #         cv2.putText(keyboard_canvas, button.text, (x + 20, y + 70), cv2.FONT_HERSHEY_PLAIN, 5, (0, 0, 0), 3)
-
- #     # Handle input and gestures from detected hands
- #     if hands:
- #         for hand in hands:
- #             lmList = hand["lmList"]
- #             if lmList:
- #                 # Get the coordinates of the index finger tip (landmark 8)
- #                 x8, y8 = lmList[8][0], lmList[8][1]
- #                 for button in buttonList:
- #                     bx, by = button.pos
- #                     bw, bh = button.size
- #                     # Check if the index finger is over a button
- #                     if bx < x8 < bx + bw and by < y8 < by + bh:
- #                         # Highlight the button and update the text
- #                         cv2.rectangle(img, (bx, by), (bx + bw, by + bh), (0, 255, 0), -1)
- #                         cv2.putText(img, button.text, (bx + 20, by + 70), cv2.FONT_HERSHEY_PLAIN, 5, (255, 255, 255), 3)
- #                         # Update the output text in session_state
- #                         session_state["output_text"] += button.text
-
- #     # Corrected return: Create a video frame from the ndarray image
- #     return av.VideoFrame.from_ndarray(img, format="bgr24")
-
-
-
-
-
-
- # Shared state for output text
- if "output_text" not in st.session_state:
-     st.session_state["output_text"] = ""
- class Detection(NamedTuple):
-     label: str
-     score: float
-     box: np.ndarray
- @st.cache_resource # Cache label colors
- def generate_label_colors():
-     return np.random.uniform(0, 255, size=(2, 3)) # Two classes: Left and Right Hand
- COLORS = generate_label_colors()
- # Initialize MediaPipe Hands
- mp_hands = mp.solutions.hands
- detector = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5)
- # Session-specific caching
- result_queue: "queue.Queue[List[Detection]]" = queue.Queue()
- # Hand detection callback
- def video_frame_callback(frame: av.VideoFrame) -> av.VideoFrame:
-     image = frame.to_ndarray(format="bgr24")
-     h, w = image.shape[:2]
-     # Process image with MediaPipe Hands
-     results = detector.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
-     detections = []
-     if results.multi_hand_landmarks:
-         for hand_landmarks, hand_class in zip(results.multi_hand_landmarks, results.multi_handedness):
-             # Extract bounding box
-             x_min, y_min = 1, 1
-             x_max, y_max = 0, 0
-             for lm in hand_landmarks.landmark:
-                 x_min = min(x_min, lm.x)
-                 y_min = min(y_min, lm.y)
-                 x_max = max(x_max, lm.x)
-                 y_max = max(y_max, lm.y)
-             # Scale bbox to image size
-             box = np.array([x_min * w, y_min * h, x_max * w, y_max * h]).astype("int")
-             # Label and score
-             label = hand_class.classification[0].label
-             score = hand_class.classification[0].score
-             detections.append(Detection(label=label, score=score, box=box))
-             # Draw bounding box and label
-             color = COLORS[0 if label == "Left" else 1]
-             cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), color, 2)
-             caption = f"{label}: {round(score * 100, 2)}%"
-             cv2.putText(
-                 image,
-                 caption,
-                 (box[0], box[1] - 15 if box[1] - 15 > 15 else box[1] + 15),
-                 cv2.FONT_HERSHEY_SIMPLEX,
-                 0.5,
-                 color,
-                 2,
-             )
-     # Put results in the queue
-     result_queue.put(detections)
-     return av.VideoFrame.from_ndarray(image, format="bgr24")
- webrtc_ctx = webrtc_streamer(
-     key="keyboard-demo",
      mode=WebRtcMode.SENDRECV,
-     rtc_configuration={
-         "iceServers": get_ice_servers(),
-         "iceTransportPolicy": "relay",
-     },
-     video_frame_callback=video_frame_callback,
      media_stream_constraints={"video": True, "audio": False},
      async_processing=True,
  )
- st.markdown("### Instructions")
- st.write(
-     """
-     1. Turn on your webcam using the checkbox above.
-     2. Use hand gestures to interact with the virtual keyboard.
-     """
- )
+ # import logging
+ # import queue
+ # from pathlib import Path
+ # from typing import List, NamedTuple
+ # import mediapipe as mp
+ # import av
+ # import cv2
+ # import numpy as np
+ # import streamlit as st
+ # from streamlit_webrtc import WebRtcMode, webrtc_streamer
+ # from sample_utils.turn import get_ice_servers
+ # from cvzone.HandTrackingModule import HandDetector
+ # from cvzone.SelfiSegmentationModule import SelfiSegmentation
+ # import time
+ # import os
+
+ # logger = logging.getLogger(__name__)
+
+ # st.title("Interactive Virtual Keyboard with Twilio Integration")
+ # st.info("Use your webcam to interact with the virtual keyboard via hand gestures.")
+
+ # class Button:
+ #     def __init__(self, pos, text, size=[100, 100]):
+ #         self.pos = pos
+ #         self.size = size
+ #         self.text = text
+
+ # # Initialize components
+ # detector = HandDetector(maxHands=1, detectionCon=0.8)
+ # # segmentor = SelfiSegmentation()
+ # # keys = [["Q", "W", "E", "R", "T", "Y", "U", "I", "O", "P"],
+ # #         ["A", "S", "D", "F", "G", "H", "J", "K", "L", ";"],
+ # #         ["Z", "X", "C", "V", "B", "N", "M", ",", ".", "/"]]
+
+ # # listImg = os.listdir('model/street')
+ # # imgList = [cv2.imread(f'model/street/{imgPath}') for imgPath in listImg]
+ # # indexImg = 0
+
+
+ # # # Function to process the video frame from the webcam
+ # # def process_video_frame(frame, detector, segmentor, imgList, indexImg, keys, session_state):
+ # #     # Convert the frame to a numpy array (BGR format)
+ # #     image = frame.to_ndarray(format="bgr24")
+
+ # #     # Remove background using SelfiSegmentation
+ # #     imgOut = segmentor.removeBG(image, imgList[indexImg])
+ # #     # Detect hands on the background-removed image
+ # #     hands, img = detector.findHands(imgOut, flipType=False)
+
+ # #     # Create a blank canvas for the keyboard
+ # #     keyboard_canvas = np.zeros_like(img)
+ # #     buttonList = []
+ # #     # Create buttons for the virtual keyboard based on the keys list
+ # #     for key in keys[0]:
+ # #         buttonList.append(Button([30 + keys[0].index(key) * 105, 30], key))
+ # #     for key in keys[1]:
+ # #         buttonList.append(Button([30 + keys[1].index(key) * 105, 150], key))
+ # #     for key in keys[2]:
+ # #         buttonList.append(Button([30 + keys[2].index(key) * 105, 260], key))
+ # #     # Draw the buttons on the keyboard canvas
+ # #     for button in buttonList:
+ # #         x, y = button.pos
+ # #         cv2.rectangle(keyboard_canvas, (x, y), (x + button.size[0], y + button.size[1]), (255, 255, 255), -1)
+ # #         cv2.putText(keyboard_canvas, button.text, (x + 20, y + 70), cv2.FONT_HERSHEY_PLAIN, 5, (0, 0, 0), 3)
+ # #     # Handle input and gestures from detected hands
+ # #     if hands:
+ # #         for hand in hands:
+ # #             lmList = hand["lmList"]
+ # #             if lmList:
+ # #                 # Get the coordinates of the index finger tip (landmark 8)
+ # #                 x8, y8 = lmList[8][0], lmList[8][1]
+ # #                 for button in buttonList:
+ # #                     bx, by = button.pos
+ # #                     bw, bh = button.size
+ # #                     # Check if the index finger is over a button
+ # #                     if bx < x8 < bx + bw and by < y8 < by + bh:
+ # #                         # Highlight the button and update the text
+ # #                         cv2.rectangle(img, (bx, by), (bx + bw, by + bh), (0, 255, 0), -1)
+ # #                         cv2.putText(img, button.text, (bx + 20, by + 70), cv2.FONT_HERSHEY_PLAIN, 5, (255, 255, 255), 3)
+ # #                         # Update the output text in session_state
+ # #                         session_state["output_text"] += button.text
+
+ # #     # Corrected return: Create a video frame from the ndarray image
+ # #     return av.VideoFrame.from_ndarray(img, format="bgr24")
+
+
+
+
+
+
+ # # Shared state for output text
+ # if "output_text" not in st.session_state:
+ #     st.session_state["output_text"] = ""
+
+ # class Detection(NamedTuple):
+ #     label: str
+ #     score: float
+ #     box: np.ndarray
+
+
+ # @st.cache_resource # Cache label colors
+ # def generate_label_colors():
+ #     return np.random.uniform(0, 255, size=(2, 3)) # Two classes: Left and Right Hand
+
+
+ # COLORS = generate_label_colors()
+ # # Initialize MediaPipe Hands
+ # mp_hands = mp.solutions.hands
+ # detector = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5)
+ # # Session-specific caching
+ # result_queue: "queue.Queue[List[Detection]]" = queue.Queue()
+ # # Hand detection callback
+ # def video_frame_callback(frame: av.VideoFrame) -> av.VideoFrame:
  #     image = frame.to_ndarray(format="bgr24")
+ #     h, w = image.shape[:2]
+ #     # Process image with MediaPipe Hands
+ #     results = detector.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+ #     detections = []
+ #     if results.multi_hand_landmarks:
+ #         for hand_landmarks, hand_class in zip(results.multi_hand_landmarks, results.multi_handedness):
+ #             # Extract bounding box
+ #             x_min, y_min = 1, 1
+ #             x_max, y_max = 0, 0
+ #             for lm in hand_landmarks.landmark:
+ #                 x_min = min(x_min, lm.x)
+ #                 y_min = min(y_min, lm.y)
+ #                 x_max = max(x_max, lm.x)
+ #                 y_max = max(y_max, lm.y)
+ #             # Scale bbox to image size
+ #             box = np.array([x_min * w, y_min * h, x_max * w, y_max * h]).astype("int")
+ #             # Label and score
+ #             label = hand_class.classification[0].label
+ #             score = hand_class.classification[0].score
+ #             detections.append(Detection(label=label, score=score, box=box))
+ #             # Draw bounding box and label
+ #             color = COLORS[0 if label == "Left" else 1]
+ #             cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), color, 2)
+ #             caption = f"{label}: {round(score * 100, 2)}%"
+ #             cv2.putText(
+ #                 image,
+ #                 caption,
+ #                 (box[0], box[1] - 15 if box[1] - 15 > 15 else box[1] + 15),
+ #                 cv2.FONT_HERSHEY_SIMPLEX,
+ #                 0.5,
+ #                 color,
+ #                 2,
+ #             )
+ #     # Put results in the queue
+ #     result_queue.put(detections)
+ #     return av.VideoFrame.from_ndarray(image, format="bgr24")
+ # webrtc_ctx = webrtc_streamer(
+ #     key="keyboard-demo",
+ #     mode=WebRtcMode.SENDRECV,
+ #     rtc_configuration={
+ #         "iceServers": get_ice_servers(),
+ #         "iceTransportPolicy": "relay",
+ #     },
+ #     video_frame_callback=video_frame_callback,
+ #     media_stream_constraints={"video": True, "audio": False},
+ #     async_processing=True,
+ # )
+ # st.markdown("### Instructions")
+ # st.write(
+ #     """
+ #     1. Turn on your webcam using the checkbox above.
+ #     2. Use hand gestures to interact with the virtual keyboard.
+ #     """
+ # )
+ import logging
+ import cv2
+ import numpy as np
+ import streamlit as st
+ from streamlit_webrtc import WebRtcMode, webrtc_streamer
+ from cvzone.HandTrackingModule import HandDetector
+ from cvzone.SelfiSegmentationModule import SelfiSegmentation
+ import os
+ import time
+ from sample_utils.turn import get_ice_servers
+
+ logger = logging.getLogger(__name__)
+
+ # Streamlit settings
+ st.set_page_config(page_title="Virtual Keyboard", layout="wide")
+ st.title("Interactive Virtual Keyboard")
+ st.subheader('''Turn on the webcam and use hand gestures to interact with the virtual keyboard.
+ Use 'a' and 'd' from the keyboard to change the background.''')
+
+ # Initialize modules
+ detector = HandDetector(maxHands=1, detectionCon=0.8)
+ segmentor = SelfiSegmentation()
+
+ # Define virtual keyboard layout
+ keys = [["Q", "W", "E", "R", "T", "Y", "U", "I", "O", "P"],
+         ["A", "S", "D", "F", "G", "H", "J", "K", "L", ";"],
+         ["Z", "X", "C", "V", "B", "N", "M", ",", ".", "/"]]
+
+ class Button:
+     def __init__(self, pos, text, size=[100, 100]):
+         self.pos = pos
+         self.size = size
+         self.text = text
+
+ listImg = os.listdir('street') if os.path.exists('street') else []
+ if not listImg:
+     st.error("Error: 'street' directory is missing or empty. Please add background images.")
+     st.stop()
+ else:
+     imgList = [cv2.imread(f'street/{imgPath}') for imgPath in listImg if cv2.imread(f'street/{imgPath}') is not None]
+
+ indexImg = 0
+ prev_key_time = [time.time()] * 2
+ output_text = ""
+
+ if "output_text" not in st.session_state:
+     st.session_state["output_text"] = ""
+
+ def video_frame_callback(frame):
+     global indexImg, output_text
+
+     img = frame.to_ndarray(format="bgr24")
+     imgOut = segmentor.removeBG(img, imgList[indexImg])
+     hands, img = detector.findHands(imgOut, flipType=False)
+
+     keyboard_canvas = np.zeros_like(img)
+     buttonList = []
+
+     for key in keys[0]:
+         buttonList.append(Button([30 + keys[0].index(key) * 105, 30], key))
+     for key in keys[1]:
+         buttonList.append(Button([30 + keys[1].index(key) * 105, 150], key))
+     for key in keys[2]:
+         buttonList.append(Button([30 + keys[2].index(key) * 105, 260], key))
+
+     for i, hand in enumerate(hands):
+         lmList = hand['lmList']
+         if lmList:
+             x4, y4 = lmList[4][0], lmList[4][1]
+             x8, y8 = lmList[8][0], lmList[8][1]
+             distance = np.sqrt((x8 - x4) ** 2 + (y8 - y4) ** 2)
+             click_threshold = 10
+
+             for button in buttonList:
+                 x, y = button.pos
+                 w, h = button.size
+                 if x < x8 < x + w and y < y8 < y + h:
+                     cv2.rectangle(img, button.pos, (x + w, y + h), (0, 255, 160), -1)
+                     cv2.putText(img, button.text, (x + 20, y + 70), cv2.FONT_HERSHEY_PLAIN, 5, (255, 255, 255), 3)
+
+                     if (distance / np.sqrt((hand['bbox'][2]) ** 2 + (hand['bbox'][3]) ** 2)) * 100 < click_threshold:
+                         if time.time() - prev_key_time[i] > 2:
+                             prev_key_time[i] = time.time()
+                             if button.text != 'BS' and button.text != 'SPACE':
+                                 output_text += button.text
+                             elif button.text == 'BS':
+                                 output_text = output_text[:-1]
+                             else:
+                                 output_text += ' '
+
+     st.session_state["output_text"] = output_text
+     return frame.from_ndarray(img, format="bgr24")
+
+ webrtc_streamer(
+     key="virtual-keyboard",
      mode=WebRtcMode.SENDRECV,
+     rtc_configuration={"iceServers": get_ice_servers(), "iceTransportPolicy": "relay"},
      media_stream_constraints={"video": True, "audio": False},
+     video_frame_callback=video_frame_callback,
      async_processing=True,
  )
+
+ st.subheader("Output Text")
+ st.text_area("Live Input:", value=st.session_state["output_text"], height=200)
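For reference, the key grid that the new callback rebuilds every frame is plain arithmetic: each key is a 100x100 Button whose x position is 30 + column * 105, and the three rows sit at y = 30, 150 and 260. A standalone sketch of that placement rule (the keys list and Button class are copied from the diff above; the comprehension and print are illustrative only):

    keys = [["Q", "W", "E", "R", "T", "Y", "U", "I", "O", "P"],
            ["A", "S", "D", "F", "G", "H", "J", "K", "L", ";"],
            ["Z", "X", "C", "V", "B", "N", "M", ",", ".", "/"]]

    class Button:
        def __init__(self, pos, text, size=[100, 100]):
            self.pos = pos    # top-left corner [x, y]
            self.size = size  # [width, height] in pixels
            self.text = text

    row_y = [30, 150, 260]  # row offsets used in the committed code
    buttons = [Button([30 + col * 105, row_y[row]], key)
               for row, row_keys in enumerate(keys)
               for col, key in enumerate(row_keys)]

    print([(b.text, b.pos) for b in buttons[:3]])  # [('Q', [30, 30]), ('W', [135, 30]), ('E', [240, 30])]

The 105-pixel pitch leaves a 5-pixel gap between the 100-pixel-wide keys, and the hover test in the callback (x < x8 < x + w and y < y8 < y + h) works directly off these positions.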