MoveNet keypoints are inaccurate on real-time webcam data


When I run MoveNet on a live webcam feed, the shoulder keypoints sit too high when the subject shows only the face and the tops of the shoulders. When the subject moves back, the shoulder keypoints are good, but the eye keypoints are too low and the arms are not tracked all the way to the wrists (the lines stop at the elbows). I am using a 13-inch MacBook Pro with the M2 chip. Here is my code:


import numpy as np
from matplotlib import pyplot as plt
import cv2 
import tensorflow as tf
EDGES = {
    (0, 1): 'm',
    (0, 2): 'c',
    (1, 3): 'm',
    (2, 4): 'c',
    (0, 5): 'm',
    (0, 6): 'c',
    (5, 7): 'm',
    (7, 9): 'm',
    (6, 8): 'c',
    (8, 10): 'c',
    (5, 6): 'y',
    (5, 11): 'm',
    (6, 12): 'c',
    (11, 12): 'y',
    (11, 13): 'm',
    (13, 15): 'm',
    (12, 14): 'c',
    (14, 16): 'c'
}  # Edges between keypoints: (index, index) pairs mapped to a display color
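# For reference, MoveNet outputs 17 keypoints in this fixed (COCO) order:
# 0 nose, 1 left eye, 2 right eye, 3 left ear, 4 right ear,
# 5 left shoulder, 6 right shoulder, 7 left elbow, 8 right elbow,
# 9 left wrist, 10 right wrist, 11 left hip, 12 right hip,
# 13 left knee, 14 right knee, 15 left ankle, 16 right ankle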


def draw_keypoints(frame, keypoints, confidence_threshold):
    # Scale the normalized keypoints to the pixel size of the image being drawn on
    y, x, c = frame.shape
    shaped = np.squeeze(np.multiply(keypoints, [y,x,1]))
    
    for kp in shaped:
        ky, kx, kp_conf = kp
        if kp_conf > confidence_threshold:
            cv2.circle(frame, (int(kx), int(ky)), 4, (0,255,0), -1) 

def draw_connections(frame, keypoints, edges, confidence_threshold):
    y, x, c = frame.shape
    shaped = np.squeeze(np.multiply(keypoints, [y,x,1]))
    
    for edge, color in edges.items():
        p1, p2 = edge
        y1, x1, c1 = shaped[p1]
        y2, x2, c2 = shaped[p2]
        
        if (c1 > confidence_threshold) and (c2 > confidence_threshold):
            cv2.line(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,255), 2)

def preprocess_image(frame):
    # Define the target size
    target_size = 256

    # Calculate the aspect ratio of the original frame
    orig_height, orig_width, _ = frame.shape
    aspect_ratio = orig_width / orig_height

    # Resize the frame
    if aspect_ratio >= 1:  # If width >= height
        new_width = target_size
        new_height = round(target_size / aspect_ratio)
    else:  # If height > width
        new_height = target_size
        new_width = round(target_size * aspect_ratio)
    frame = cv2.resize(frame, (new_width, new_height))

    # Pad the frame
    pad_top = (target_size - new_height) // 2
    pad_bottom = target_size - new_height - pad_top
    pad_left = (target_size - new_width) // 2
    pad_right = target_size - new_width - pad_left
    frame = cv2.copyMakeBorder(frame, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT)
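    # Note: the model's keypoints come back normalized to this padded
    # target_size x target_size image, not to the original frame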

    return frame




interpreter = tf.lite.Interpreter(model_path='lite-model_movenet_singlepose_thunder_3.tflite')  # Load the MoveNet Thunder TFLite model
interpreter.allocate_tensors()  # Allocate the model's input and output tensors
img = None
cap = cv2.VideoCapture(0)
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:  # Stop if a frame could not be read from the camera
        break
    
    # Reshape image
    img = frame.copy()
    img = preprocess_image(img)
    # Convert to float32 and add an extra dimension for the batch size
    input_image = np.expand_dims(img.astype(np.float32), axis=0)

    
    # Setup input and output 
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    
    # Make predictions 
    interpreter.set_tensor(input_details[0]['index'], input_image)
    interpreter.invoke()
    keypoints_with_scores = interpreter.get_tensor(output_details[0]['index'])
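    # keypoints_with_scores has shape [1, 1, 17, 3]: 17 keypoints,
    # each as (y, x, score) normalized to [0, 1]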
    
    # Rendering 
    draw_connections(frame, keypoints_with_scores, EDGES, 0.1)
    draw_keypoints(frame, keypoints_with_scores, 0.1)
    
    cv2.imshow('MoveNet Thunder', frame)
    
    if cv2.waitKey(10) & 0xFF == ord('q'):
        break
        
cap.release()
cv2.destroyAllWindows()
plt.imshow(img)  # Show the last preprocessed frame
plt.show()
print(img.shape)

right_hand = keypoints_with_scores[0][0][9]   # keypoint 9 (left wrist in MoveNet's order; the right side in a mirrored webcam view)
left_hand = keypoints_with_scores[0][0][10]   # keypoint 10 (right wrist in MoveNet's order)
px_coordinates = np.array(left_hand[:2] * [720, 1280]).astype(int)  # Convert normalized (y, x) to pixels, assuming a 720x1280 (height x width) frame




I have tried changing the drawing functions and the preprocessing function, but sometimes that moves the points to the top left of the screen so they do not cover the body at all.

There is 1 best solution below


I suggest you draw on the preprocessed image rather than on the original frame. The frame is resized and padded inside preprocess_image, so the keypoints the model returns are normalized to that resized image, not to the original frame.
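For example, in the rendering part of the loop you could pass img (the output of preprocess_image) to the drawing functions instead of frame; a minimal sketch reusing the question's own functions:

    # Draw on the preprocessed image: the model's normalized keypoints
    # refer to this 256x256 padded image, so the overlay lines up
    draw_connections(img, keypoints_with_scores, EDGES, 0.1)
    draw_keypoints(img, keypoints_with_scores, 0.1)

    cv2.imshow('MoveNet Thunder', img)

If you would rather keep drawing on the original frame, you have to undo the scaling and padding first. A minimal sketch, assuming the question's preprocess_image with target_size = 256 (the helper name keypoints_to_frame is mine):

def keypoints_to_frame(keypoints, frame_shape, target_size=256):
    # Map keypoints normalized to the padded target_size x target_size
    # image back to pixel coordinates in the original frame
    orig_h, orig_w = frame_shape[:2]
    scale = target_size / max(orig_h, orig_w)  # resize factor used in preprocess_image
    pad_top = (target_size - round(orig_h * scale)) // 2
    pad_left = (target_size - round(orig_w * scale)) // 2
    kps = np.squeeze(keypoints).copy()  # shape (17, 3): y, x, score
    kps[:, 0] = (kps[:, 0] * target_size - pad_top) / scale   # y in frame pixels
    kps[:, 1] = (kps[:, 1] * target_size - pad_left) / scale  # x in frame pixels
    return kps

Note that draw_keypoints and draw_connections scale by the frame shape themselves, so with this helper you would pass the already-scaled pixel coordinates and drop that multiplication inside the drawing functions.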