|
import logging |
|
import queue |
|
from typing import List, NamedTuple |
|
import av |
|
import cv2 |
|
import numpy as np |
|
import streamlit as st |
|
from streamlit_webrtc import WebRtcMode, webrtc_streamer |
|
from sample_utils.turn import get_ice_servers |
|
import mediapipe as mp |
|
import os |
|
import time |
|
|
|
|
|
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Page chrome: browser-tab metadata first, then the visible heading and the
# short usage hint shown beneath it.
st.set_page_config(
    page_title="Virtual Keyboard",
    page_icon="🏋️",
)
st.title("Interactive Virtual Keyboard")
st.subheader(
    '''Turn on the webcam and use hand gestures to interact with the virtual keyboard.
Use 'a' and 'd' from the keyboard to change the background.'''
)
|
|
|
|
|
# MediaPipe handles: the hands solution module and its drawing helpers, plus
# one shared tracker instance configured for a single hand.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    max_num_hands=1,
    min_detection_confidence=0.5,
)
|
|
|
|
|
# Keyboard layout: one inner list of single-character labels per row,
# top row first.
keys = [
    list(row)
    for row in (
        "QWERTYUIOP",
        "ASDFGHJKL;",
        "ZXCVBNM,./",
    )
]
|
|
|
class Button:
    """A rectangular key of the on-screen keyboard.

    Attributes:
        pos: Top-left corner of the key as an ``[x, y]`` pair.
        size: Extent of the key as a ``[width, height]`` pair.
        text: Label associated with the key.
    """

    def __init__(self, pos, text, size=None):
        """Store the key's geometry and label.

        Args:
            pos: ``[x, y]`` of the key's top-left corner.
            text: Label of the key.
            size: ``[width, height]`` of the key; defaults to a new
                ``[100, 100]`` list when omitted.
        """
        self.pos = pos
        # Build the default per instance. A list literal in the signature is
        # created once and shared, so resizing one default-sized button in
        # place would silently resize all the others.
        self.size = [100, 100] if size is None else size
        self.text = text
|
|
|
# One detection record: a text label, a score, and a box array.
Detection = NamedTuple(
    "Detection",
    [
        ("label", str),
        ("score", float),
        ("box", np.ndarray),
    ],
)
|
|
|
# Queue onto which the frame callback puts each frame's detection list.
result_queue: "queue.Queue[List[Detection]]" = queue.Queue()

# Module-level state: background image index and the text typed so far.
indexImg, output_text = 0, ""

# Two timestamps, both initialised to the moment the script runs.
prev_key_time = 2 * [time.time()]

# Seed the session entry once so later reads never hit a missing key.
if "output_text" not in st.session_state:
    st.session_state["output_text"] = ""
|
|
|
|
|
def _build_button_list():
    """Create the Button objects for the three key rows plus BS and SPACE."""
    row_tops = (30, 150, 260)
    # enumerate() yields the column directly, avoiding a list.index() search
    # for every key.
    buttons = [
        Button([30 + column * 105, top], key)
        for row, top in zip(keys, row_tops)
        for column, key in enumerate(row)
    ]
    buttons.append(Button([90 + 10 * 100, 30], 'BS', size=[125, 100]))
    buttons.append(Button([300, 370], 'SPACE', size=[500, 100]))
    return buttons


def _hand_detection(hand_landmarks, frame_w, frame_h):
    """Return a Detection whose box is [x, y, width, height] around the hand.

    Landmark coordinates are scaled by the frame size and truncated to ints.
    """
    xs = [int(lm.x * frame_w) for lm in hand_landmarks.landmark]
    ys = [int(lm.y * frame_h) for lm in hand_landmarks.landmark]
    # The minima start at the frame size and the maxima at zero, as seeds
    # for the running min/max over the landmarks.
    x_min, y_min = min([frame_w, *xs]), min([frame_h, *ys])
    x_max, y_max = max([0, *xs]), max([0, *ys])
    bbox = [x_min, y_min, x_max - x_min, y_max - y_min]
    return Detection(label="Hand", score=0.5, box=np.array(bbox))


def _draw_hovered_key(img, buttons, fingertip):
    """Paint every key whose rectangle strictly contains the fingertip."""
    if fingertip is None:
        # No hand in this frame, so nothing can be hovered.
        return
    tip_x, tip_y = fingertip
    for button in buttons:
        x, y = button.pos
        key_w, key_h = button.size
        if x < tip_x < x + key_w and y < tip_y < y + key_h:
            # NOTE(review): indentation was lost in the source this was
            # restored from; this assumes both the filled rectangle and the
            # label are drawn only for the hovered key. Confirm.
            cv2.rectangle(img, (x, y), (x + key_w, y + key_h), (0, 255, 160), -1)
            cv2.putText(img, button.text, (x + 20, y + 70), cv2.FONT_HERSHEY_PLAIN, 5, (255, 255, 255), 3)


def video_frame_callback(frame: av.VideoFrame) -> av.VideoFrame:
    """Annotate one webcam frame with the hovered key and the tracked hand.

    The frame is converted to a BGR array and run through the shared
    MediaPipe ``hands`` tracker. Any key under the index fingertip is
    highlighted, then the hand landmarks are drawn on top.

    Args:
        frame: Incoming video frame.

    Returns:
        A new ``av.VideoFrame`` built from the annotated BGR image.

    Side effects:
        Puts the list of ``Detection`` objects for this frame on
        ``result_queue`` and stores ``output_text`` in
        ``st.session_state["output_text"]``.
    """
    img = frame.to_ndarray(format="bgr24")
    result = hands.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    frame_h, frame_w, _ = img.shape
    tracked_hands = result.multi_hand_landmarks or []

    # Find the fingertip BEFORE drawing the keyboard. The hover test used to
    # read the fingertip before it was assigned, which raised
    # UnboundLocalError on every frame, with or without a hand.
    detections = []
    fingertip = None
    for hand_landmarks in tracked_hands:
        detections.append(_hand_detection(hand_landmarks, frame_w, frame_h))
        tip = hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]
        fingertip = (int(tip.x * frame_w), int(tip.y * frame_h))

    _draw_hovered_key(img, _build_button_list(), fingertip)

    # Landmarks are drawn after the key so the hand stays on top of the
    # filled key rectangle.
    for hand_landmarks in tracked_hands:
        mp_drawing.draw_landmarks(
            img, hand_landmarks, mp_hands.HAND_CONNECTIONS,
            mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2)
        )

    result_queue.put(detections)
    # NOTE(review): streamlit-webrtc presumably runs this callback outside
    # the Streamlit script thread, in which case this write may never reach
    # the page's session. Confirm, and consider passing the text through a
    # queue instead.
    st.session_state["output_text"] = output_text
    return av.VideoFrame.from_ndarray(img, format="bgr24")
|
|
|
|
|
# Connection settings: ICE servers come from the project helper, and the
# transport policy is set to "relay".
_rtc_configuration = {
    "iceServers": get_ice_servers(),
    "iceTransportPolicy": "relay",
}

# Video only; no audio track is requested.
_media_stream_constraints = {"video": True, "audio": False}

# Start the send/receive stream and hand each frame to the callback.
webrtc_streamer(
    key="virtual-keyboard",
    mode=WebRtcMode.SENDRECV,
    rtc_configuration=_rtc_configuration,
    media_stream_constraints=_media_stream_constraints,
    video_frame_callback=video_frame_callback,
    async_processing=True,
)