# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Main scripts to run gesture recognition."""

import argparse
import sys
import time

import cv2
import mediapipe as mp

from picamera2 import Picamera2

from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from mediapipe.framework.formats import landmark_pb2

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles


# Global variables to calculate FPS
COUNTER, FPS = 0, 0
START_TIME = time.time()


def run(model: str, num_hands: int,
        min_hand_detection_confidence: float,
        min_hand_presence_confidence: float, min_tracking_confidence: float,
        camera_id: int, width: int, height: int) -> None:
  """Continuously run inference on images acquired from the camera.

  Args:
      model: Name of the gesture recognition model bundle.
      num_hands: Max number of hands that can be detected by the recognizer.
      min_hand_detection_confidence: The minimum confidence score for hand
        detection to be considered successful.
      min_hand_presence_confidence: The minimum confidence score of hand
        presence in the hand landmark detection.
      min_tracking_confidence: The minimum confidence score for the hand
        tracking to be considered successful.
      camera_id: The camera id to be passed to OpenCV.
      width: The width of the frame captured from the camera.
      height: The height of the frame captured from the camera.
  """

  # Start capturing video input from the camera
  picam2 = Picamera2()
  picam2.configure(picam2.create_preview_configuration(
      main={"format": 'XRGB8888', "size": (1280, 720)}))
  picam2.start()
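
  # Note: the preview configuration above hard-codes a 1280x720 stream, so the
  # width and height arguments passed to run() are not applied here; if
  # desired, (width, height) could be substituted into the "size" field.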

  # Visualization parameters
  row_size = 50  # pixels
  left_margin = 24  # pixels
  text_color = (0, 0, 0)  # black
  font_size = 1
  font_thickness = 1
  fps_avg_frame_count = 10

  # Label box parameters
  label_text_color = (255, 255, 255)  # white
  label_font_size = 1
  label_thickness = 2

  recognition_frame = None
  recognition_result_list = []
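
  # save_result() below is registered as the recognizer's result callback. In
  # LIVE_STREAM mode it is invoked asynchronously by the recognizer, so
  # recognition_result_list acts as the hand-off point between that callback
  # and the display loop further down, which drains the list once per frame.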
  def save_result(result: vision.GestureRecognizerResult,
                  unused_output_image: mp.Image, timestamp_ms: int):
    global FPS, COUNTER, START_TIME

    # Calculate the FPS
    if COUNTER % fps_avg_frame_count == 0:
      FPS = fps_avg_frame_count / (time.time() - START_TIME)
      START_TIME = time.time()

    recognition_result_list.append(result)
    COUNTER += 1

  # Initialize the gesture recognizer model
  base_options = python.BaseOptions(model_asset_path=model)
  options = vision.GestureRecognizerOptions(base_options=base_options,
                                            running_mode=vision.RunningMode.LIVE_STREAM,
                                            num_hands=num_hands,
                                            min_hand_detection_confidence=min_hand_detection_confidence,
                                            min_hand_presence_confidence=min_hand_presence_confidence,
                                            min_tracking_confidence=min_tracking_confidence,
                                            result_callback=save_result)
  recognizer = vision.GestureRecognizer.create_from_options(options)

  # Continuously capture images from the camera and run inference
  while True:
    image = picam2.capture_array()

    image = cv2.flip(cv2.rotate(image, cv2.ROTATE_180), 1)

    # Convert the image from BGR to RGB as required by the TFLite model.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

    # Run gesture recognizer using the model.
    recognizer.recognize_async(mp_image, time.time_ns() // 1_000_000)
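
    # recognize_async() is non-blocking; the timestamp (in milliseconds) must
    # increase monotonically across calls, which time.time_ns() // 1_000_000
    # provides here. Results are delivered later through the save_result
    # callback registered above.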

    # Show the FPS
    fps_text = 'FPS = {:.1f}'.format(FPS)
    text_location = (left_margin, row_size)
    current_frame = image
    cv2.putText(current_frame, fps_text, text_location, cv2.FONT_HERSHEY_DUPLEX,
                font_size, text_color, font_thickness, cv2.LINE_AA)

    if recognition_result_list:
      # Draw landmarks and write the text for each hand.
      for hand_index, hand_landmarks in enumerate(
          recognition_result_list[0].hand_landmarks):
        # Calculate the bounding box of the hand
        x_min = min([landmark.x for landmark in hand_landmarks])
        y_min = min([landmark.y for landmark in hand_landmarks])
        y_max = max([landmark.y for landmark in hand_landmarks])

        # Convert normalized coordinates to pixel values
        frame_height, frame_width = current_frame.shape[:2]
        x_min_px = int(x_min * frame_width)
        y_min_px = int(y_min * frame_height)
        y_max_px = int(y_max * frame_height)

        # Get gesture classification results
        if recognition_result_list[0].gestures:
          gesture = recognition_result_list[0].gestures[hand_index]
          category_name = gesture[0].category_name
          score = round(gesture[0].score, 2)
          result_text = f'{category_name} ({score})'

          # Compute text size
          text_size = \
            cv2.getTextSize(result_text, cv2.FONT_HERSHEY_DUPLEX,
                            label_font_size, label_thickness)[0]
          text_width, text_height = text_size

          # Calculate text position (above the hand)
          text_x = x_min_px
          text_y = y_min_px - 10  # Adjust this value as needed

          # Make sure the text is within the frame boundaries
          if text_y < 0:
            text_y = y_max_px + text_height

          # Draw the text
          cv2.putText(current_frame, result_text, (text_x, text_y),
                      cv2.FONT_HERSHEY_DUPLEX, label_font_size,
                      label_text_color, label_thickness, cv2.LINE_AA)

        # Draw hand landmarks on the frame
        hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
        hand_landmarks_proto.landmark.extend([
            landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y,
                                            z=landmark.z) for landmark in
            hand_landmarks
        ])
        mp_drawing.draw_landmarks(
            current_frame,
            hand_landmarks_proto,
            mp_hands.HAND_CONNECTIONS,
            mp_drawing_styles.get_default_hand_landmarks_style(),
            mp_drawing_styles.get_default_hand_connections_style())

      recognition_frame = current_frame
      recognition_result_list.clear()

    if recognition_frame is not None:
      recognition_frame = cv2.cvtColor(recognition_frame, cv2.COLOR_RGB2BGR)
      cv2.imshow('gesture_recognition', recognition_frame)

    # Stop the program if the ESC key is pressed.
    if cv2.waitKey(1) == 27:
      break

  recognizer.close()
  picam2.stop()
  cv2.destroyAllWindows()


def main():
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument(
      '--model',
      help='Name of gesture recognition model.',
      required=False,
      default='gesture_recognizer.task')
  parser.add_argument(
      '--numHands',
      help='Max number of hands that can be detected by the recognizer.',
      required=False,
      default=1)
  parser.add_argument(
      '--minHandDetectionConfidence',
      help='The minimum confidence score for hand detection to be considered '
      'successful.',
      required=False,
      default=0.5)
  parser.add_argument(
      '--minHandPresenceConfidence',
      help='The minimum confidence score of hand presence in the hand '
      'landmark detection.',
      required=False,
      default=0.5)
  parser.add_argument(
      '--minTrackingConfidence',
      help='The minimum confidence score for the hand tracking to be '
      'considered successful.',
      required=False,
      default=0.5)
  # Finding the camera ID can be very platform-dependent. One common approach
  # relies on the fact that camera IDs are usually indexed sequentially by the
  # OS, starting from 0: create an OpenCV VideoCapture object for each
  # potential ID with 'cap = cv2.VideoCapture(i)'. If 'cap' is None or
  # 'cap.isOpened()' returns False, that camera ID is not available. A short
  # sketch of this probing approach follows below.
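  #
  # A minimal sketch of that probing loop (illustrative only and not called by
  # this script; the helper name and the number of IDs probed are arbitrary):
  #
  #   def list_available_cameras(max_ids: int = 5) -> list:
  #     available = []
  #     for i in range(max_ids):
  #       cap = cv2.VideoCapture(i)
  #       if cap is not None and cap.isOpened():
  #         available.append(i)
  #         cap.release()
  #     return available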
  parser.add_argument(
      '--cameraId', help='Id of camera.', required=False, default=0)
  parser.add_argument(
      '--frameWidth',
      help='Width of frame to capture from camera.',
      required=False,
      default=640)
  parser.add_argument(
      '--frameHeight',
      help='Height of frame to capture from camera.',
      required=False,
      default=480)
  args = parser.parse_args()

  run(args.model, int(args.numHands), args.minHandDetectionConfidence,
      args.minHandPresenceConfidence, args.minTrackingConfidence,
      int(args.cameraId), int(args.frameWidth), int(args.frameHeight))


if __name__ == '__main__':
  main()