I am using an OpenVINO action recognition model to detect a person's actions. I referred to the C++ Smart Classroom demo, but I have trouble parsing the output in Python.
The following is my code. I can see the output shapes for all anchors. How can I parse these into the correct bounding boxes and classes? Which output anchors should I consider as the final output?
from openvino.inference_engine import IECore
import cv2
model = "./models/person-detection-action-recognition-0006.xml" # model xml path
out_blob_h1a1 = "ActionNet/action_heads/out_head_1_anchor_1"
out_blob_h2a1 = "ActionNet/action_heads/out_head_2_anchor_1"
out_blob_h2a2 = "ActionNet/action_heads/out_head_2_anchor_2"
out_blob_h2a3 = "ActionNet/action_heads/out_head_2_anchor_3"
out_blob_h2a4 = "ActionNet/action_heads/out_head_2_anchor_4"
out_blob_conf = "ActionNet/out_detection_conf"
out_blob_loc = "ActionNet/out_detection_loc"
cap = cv2.VideoCapture("./data/classroom.mp4")
def load_model(model):
    # Getting the *.bin file location from the *.xml path
    model_bin = model[:-3] + "bin"
    # Loading the Inference Engine API
    ie = IECore()
    # Loading IR files
    net = ie.read_network(model=model, weights=model_bin)
    # Loading the network to the inference engine
    exec_net = ie.load_network(network=net, device_name="CPU")
    print("IR successfully loaded into Inference Engine.")
    return exec_net
exec_net = load_model(model)
def get_input_shape(net):
    """Returns the input shape of the given network."""
    input_key = list(net.input_info.keys())[0]
    input_shape = net.input_info[input_key].input_data.shape
    return input_shape
def preprocessing(input_image, N, C, height, width):
    """
    Given an image and the desired height and width,
    resizes the image to that height and width
    and brings the color channel to the front.
    """
    image = cv2.resize(input_image, (width, height))
    image = image.transpose((2, 0, 1))
    image = image.reshape(N, C, height, width)
    return image
def sync_inference(exec_net, image):
    # Run a synchronous inference request on the network's single input blob.
    input_key = list(exec_net.input_info.keys())[0]
    return exec_net.infer({input_key: image})
def run_model(frame):
    n, c, h, w = get_input_shape(exec_net)
    preprocessed_image = preprocessing(frame, n, c, h, w)
    result_sync = sync_inference(exec_net, image=preprocessed_image)
    # Index the result dict by the output blob names defined above, so the
    # mapping does not depend on the iteration order of exec_net.outputs.
    out_head_1_anchor_1 = result_sync[out_blob_h1a1]
    out_head_2_anchor_1 = result_sync[out_blob_h2a1]
    out_head_2_anchor_2 = result_sync[out_blob_h2a2]
    out_head_2_anchor_3 = result_sync[out_blob_h2a3]
    out_head_2_anchor_4 = result_sync[out_blob_h2a4]
    out_detection_conf = result_sync[out_blob_conf]
    out_detection_loc = result_sync[out_blob_loc]
    print(out_head_1_anchor_1.shape)
    print(out_head_2_anchor_1.shape)
    print(out_head_2_anchor_2.shape)
    print(out_head_2_anchor_3.shape)
    print(out_head_2_anchor_4.shape)
    print(out_detection_conf.shape)
    print(out_detection_loc.shape)
    print(" ")
def main():
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("stream end.. exiting")
            break
        run_model(frame)
        #cv2.imshow('frame', frame)
        # wait for a key press and break out of the loop
        if cv2.waitKey(1) == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
main()
It seems like you are printing out the shapes of the model's output layers.
Refer to action_detector.hpp for the details of the anchors.
For writing the code in Python, you may refer to the action_recognition_demo and object_detection_demo.
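To give a rough idea of what that parsing looks like in Python: the model is an SSD-style detector, so out_detection_loc holds box deltas, out_detection_conf holds per-prior person scores, and every kept prior corresponds to one cell of one of the action heads. Below is a minimal sketch of the decoding step under those assumptions. The helper names (decode_ssd_boxes, softmax), the prior boxes, the variance values, and the prior-to-head mapping are placeholders of my own; the authoritative values (grid sizes, anchor dimensions, encoding) are defined by ActionDetector in the Smart Classroom demo (action_detector.hpp/cpp), so treat this as an illustration rather than a drop-in implementation.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def decode_ssd_boxes(loc, conf, priors, conf_threshold=0.5, variances=(0.1, 0.2)):
    """Sketch of SSD-style box decoding.

    loc    -- [num_priors, 4] raw deltas, reshaped from out_detection_loc
    conf   -- [num_priors, 2] scores, reshaped from out_detection_conf
    priors -- [num_priors, 4] prior boxes as (cx, cy, w, h) in relative
              coordinates; these must be generated the same way the demo's
              ActionDetector does (per head, per cell, per anchor) and are
              only a placeholder argument here.
    """
    detections = []
    for i in range(priors.shape[0]):
        score = conf[i, 1]                      # assuming index 1 = "person"
        if score < conf_threshold:
            continue
        p_cx, p_cy, p_w, p_h = priors[i]
        d_cx, d_cy, d_w, d_h = loc[i]
        # Standard SSD decoding; the variance values are an assumption.
        cx = p_cx + d_cx * variances[0] * p_w
        cy = p_cy + d_cy * variances[0] * p_h
        w = p_w * np.exp(d_w * variances[1])
        h = p_h * np.exp(d_h * variances[1])
        detections.append((i, float(score),
                           (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)))
    return detections

# For each kept prior index you would then look up the matching cell of the
# corresponding action head (out_head_1_anchor_1 for the first head,
# out_head_2_anchor_1..4 for the second), softmax over its class channels and
# take the argmax as the action label. Assuming an NCHW layout with the action
# logits as channels, that would look roughly like:
#     action_probs = softmax(out_head_2_anchor_1[0, :, y, x])
#     action_id = int(np.argmax(action_probs))

The returned coordinates are relative, so they still need to be scaled by the original frame width and height before drawing, and the demo additionally runs non-maximum suppression over the decoded boxes. The mapping from a prior index to a particular (head, anchor, y, x) cell is exactly what the anchor configuration in action_detector.hpp encodes, which is why that file is the reference to port.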