# streamlit-webrtc-example-experimental
#
# Running
#
# File size: 5,980 Bytes

import logging
import queue
from typing import List, NamedTuple
import av
import cv2
import numpy as np
import streamlit as st
from streamlit_webrtc import WebRtcMode, webrtc_streamer
from sample_utils.turn import get_ice_servers
import mediapipe as mp
import os
import time

# Logger Setup
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Streamlit settings
# Page metadata is configured first, followed by the title and subheader text.
st.set_page_config(page_title="Virtual Keyboard", page_icon="🏋️")
st.title("Interactive Virtual Keyboard")
st.subheader('''Turn on the webcam and use hand gestures to interact with the virtual keyboard.''')

# Initialize the MediaPipe hand tracker and its landmark-drawing helpers.
# Only the Hands solution is set up here; no background segmentation is
# created in the visible code.
mp_hands = mp.solutions.hands
# One shared tracker instance: at most one hand, minimum detection confidence 0.5.
hands = mp_hands.Hands(max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# Virtual keyboard layout: one string per keyboard row, split into
# single-character key labels (three rows of ten keys each).
keys = [list(row_chars) for row_chars in ("QWERTYUIOP", "ASDFGHJKL;", "ZXCVBNM,./")]


class Button:
    """A rectangular on-screen key.

    Attributes:
        pos: Top-left corner of the key as ``[x, y]``.
        size: Key dimensions as ``[width, height]``.
        text: Label drawn on the key.
    """

    def __init__(self, pos, text, size=None):
        """Store the key's position, label and size.

        Args:
            pos: Top-left corner as ``[x, y]``.
            text: Label of the key.
            size: ``[width, height]``; defaults to ``[100, 100]``.
        """
        self.pos = pos
        # Build the default per instance: a list literal in the signature
        # would be a single object shared (and mutable) across all buttons.
        self.size = [100, 100] if size is None else size
        self.text = text


# Immutable record for one detection: a text label, a float score and a
# NumPy array describing the bounding box.
Detection = NamedTuple(
    "Detection",
    [
        ("label", str),
        ("score", float),
        ("box", np.ndarray),
    ],
)


# Global variables
# Queue onto which video_frame_callback puts one list of detections per frame.
# NOTE(review): no consumer of this queue is visible in this file — confirm
# something reads it, otherwise it only accumulates.
result_queue: "queue.Queue[List[Detection]]" = queue.Queue()
# NOTE(review): indexImg is never read or given a new value anywhere in the
# visible code — candidate for removal.
indexImg = 0
# Text typed so far; video_frame_callback updates it and draws it on each frame.
output_text = ""
# Two copies of the start-up timestamp. In the visible code only index 0 is
# used, as the time of the last accepted key press.
prev_key_time = [time.time()] * 2

# Seed the session-state entry if it is missing.
# NOTE(review): nothing visible reads or updates this entry; the callback keeps
# its text in the module-level output_text — confirm whether they should be synced.
if "output_text" not in st.session_state:
    st.session_state["output_text"] = ""


# Video Frame Callback with Logic
def _layout_buttons(w, h):
    """Return the list of Button objects for a frame of width *w* and height *h*."""
    key_width = int(0.07 * w)
    key_height = int(0.09 * h)
    # Character keys: a grid starting 3% in from the top-left corner, 5 px gaps.
    buttons = [
        Button(
            [int(0.03 * w + col * (key_width + 5)), int(0.03 * h + row * (key_height + 5))],
            key,
            size=[key_width, key_height],
        )
        for row, key_row in enumerate(keys)
        for col, key in enumerate(key_row)
    ]
    # Special buttons for Backspace and Space.
    buttons.append(Button([int(0.9 * w), int(0.03 * h)], 'BS', size=[int(0.08 * w), key_height]))
    buttons.append(Button([int(0.2 * w), int(0.4 * h)], 'SPACE', size=[int(0.6 * w), key_height]))
    return buttons


def _draw_button(img, button, fill, text_color, font_scale, font_thickness):
    """Draw *button* on *img* as a filled rectangle with its label."""
    x, y = button.pos
    bw, bh = button.size
    cv2.rectangle(img, (x, y), (x + bw, y + bh), fill, -1)
    cv2.putText(img, button.text, (x + int(0.2 * bw), y + int(0.7 * bh)),
                cv2.FONT_HERSHEY_PLAIN, font_scale, text_color, font_thickness)


def _landmark_px(landmark, w, h):
    """Convert a landmark's x/y (scaled by *w* and *h*) to integer pixel coordinates."""
    return int(landmark.x * w), int(landmark.y * h)


def _apply_key(text, key):
    """Return *text* after pressing *key*: 'BS' drops the last character, 'SPACE' appends a blank."""
    if key == 'BS':
        return text[:-1]
    if key == 'SPACE':
        return text + ' '
    return text + key


def _publish_latest(detections):
    """Put *detections* on result_queue, discarding any results still queued."""
    # The queue is unbounded and this callback adds one entry per frame, so
    # without draining it grows for as long as the stream runs.
    try:
        while True:
            result_queue.get_nowait()
    except queue.Empty:
        pass
    result_queue.put(detections)


# Video Frame Callback with Logic
def video_frame_callback(frame: av.VideoFrame) -> av.VideoFrame:
    """Draw the virtual keyboard on a frame and handle pinch "key presses".

    The frame is run through the shared MediaPipe ``hands`` tracker. The
    keyboard is drawn, the button under the index fingertip is highlighted,
    and when the thumb tip and index tip are closer than 20% of the hand's
    bounding-box diagonal the hovered key is applied to the module-level
    ``output_text`` (at most once every 2 seconds). The typed text is drawn
    in a panel, and the frame's detections are published on ``result_queue``.

    Args:
        frame: Incoming video frame.

    Returns:
        A new frame (bgr24) with the keyboard, hand landmarks and text drawn.
    """
    global output_text

    img = frame.to_ndarray(format="bgr24")
    result = hands.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # All geometry is derived from the frame size.
    h, w = img.shape[:2]
    font_scale = 0.0045 * w
    font_thickness = int(0.009 * h)

    # Draw Keyboard Buttons
    buttons = _layout_buttons(w, h)
    for button in buttons:
        _draw_button(img, button, (200, 200, 200), (0, 0, 0), font_scale, font_thickness)

    detections = []
    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:
            mp_drawing.draw_landmarks(
                img, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=4),
                mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2)
            )

            # Bounding box over all landmarks; the extra w/h/0 entries keep the
            # box clamped the same way as seeding min/max with the frame bounds.
            points = [_landmark_px(lm, w, h) for lm in hand_landmarks.landmark]
            xs = [px for px, _ in points]
            ys = [py for _, py in points]
            x_min, y_min = min([w, *xs]), min([h, *ys])
            x_max, y_max = max([0, *xs]), max([0, *ys])

            bbox = [x_min, y_min, x_max - x_min, y_max - y_min]
            detections.append(Detection(label="Hand", score=0.5, box=np.array(bbox)))

            x4, y4 = _landmark_px(hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP], w, h)
            x8, y8 = _landmark_px(hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP], w, h)

            # A "click" is a pinch: thumb-to-index distance relative to hand size.
            distance = np.sqrt((x8 - x4) ** 2 + (y8 - y4) ** 2)
            click_threshold = 0.2 * np.sqrt(bbox[2] ** 2 + bbox[3] ** 2)

            for button in buttons:
                x, y = button.pos
                bw, bh = button.size
                if not (x < x8 < x + bw and y < y8 < y + bh):
                    continue

                # Highlight the key under the index fingertip.
                _draw_button(img, button, (0, 255, 160), (255, 255, 255), font_scale, font_thickness)

                # Accept a press only when pinching and 2 s after the last one.
                if distance < click_threshold and time.time() - prev_key_time[0] > 2:
                    prev_key_time[0] = time.time()
                    output_text = _apply_key(output_text, button.text)

    # Draw a background rectangle for the output text
    text_x = int(0.05 * w)
    text_y = int(0.70 * h)
    text_width = int(0.9 * w)
    text_height = int(0.1 * h)
    cv2.rectangle(img,
                  (text_x, text_y - text_height),
                  (text_x + text_width, text_y),
                  (100, 100, 100),
                  -1)

    # Overlay the output text
    cv2.putText(img, output_text, (text_x + int(0.02 * w), text_y - int(0.02 * h)), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 255), 5)

    _publish_latest(detections)
    return av.VideoFrame.from_ndarray(img, format="bgr24")


# WebRTC Streamer
# Start the streamer component with video_frame_callback registered as the
# video frame handler.
webrtc_streamer(
    key="virtual-keyboard",
    mode=WebRtcMode.SENDRECV,
    # ICE servers come from the project's sample_utils.turn helper.
    # NOTE(review): "relay" presumably restricts ICE to TURN-relayed
    # candidates — confirm this is intended for every deployment.
    rtc_configuration={"iceServers": get_ice_servers(), "iceTransportPolicy": "relay"},
    # Request video only; audio is disabled.
    media_stream_constraints={"video": True, "audio": False},
    video_frame_callback=video_frame_callback,
    async_processing=True,
)