OpenVINO anchor output in Python

I am using the OpenVINO person-detection-action-recognition model to detect people's actions. I referred to the C++ Smart Classroom demo, but I am having trouble parsing the output in Python.

The following is my code. I can print the output shapes for all anchors. How can I parse these into the correct bounding boxes and classes? Which output anchors should I treat as the final output?

from openvino.inference_engine import IECore
import cv2
 
model = "./models/person-detection-action-recognition-0006.xml" # model xml path

out_blob_h1a1 = "ActionNet/action_heads/out_head_1_anchor_1"
out_blob_h2a1 = "ActionNet/action_heads/out_head_2_anchor_1"
out_blob_h2a2 = "ActionNet/action_heads/out_head_2_anchor_2"
out_blob_h2a3 = "ActionNet/action_heads/out_head_2_anchor_3"
out_blob_h2a4 = "ActionNet/action_heads/out_head_2_anchor_4"
out_blob_conf = "ActionNet/out_detection_conf" 
out_blob_loc = "ActionNet/out_detection_loc"

cap = cv2.VideoCapture("./data/classroom.mp4") 

def load_model(model):
    # Getting the *.bin file location
    model_bin = model[:-3] + "bin"
    # Loading the Inference Engine API
    ie = IECore()
    
    # Loading IR files    
    net = ie.read_network(model=model, weights=model_bin)

    # Loading the network to the inference engine
    exec_net = ie.load_network(network=net, device_name="CPU")
    print("IR successfully loaded into Inference Engine.")
    
    return exec_net

exec_net = load_model(model)

def get_input_shape(net):
    """returns input shape of the given network"""
    input_key = list(net.input_info.keys())[0]
    input_shape = net.input_info[input_key].input_data.shape    
    return input_shape

def preprocessing(input_image, N, C, height, width):
    """
    Given an image and the desired height and width,
    resizes the image to that height and width,
    moves the color channel to the front, and adds
    the batch dimension.
    """
    image = cv2.resize(input_image, (width, height))
    image = image.transpose((2, 0, 1))
    image = image.reshape(N, C, height, width)
    return image
 
def sync_inference(exec_net, image):
    input_key = list(exec_net.input_info.keys())[0]
    result = exec_net.infer({input_key: image})
    return result, list(exec_net.outputs.keys())
 
 
def run_model(frame):
    n, c, h, w = get_input_shape(exec_net)
    preprocessed_image = preprocessing(frame, n, c, h, w)
    result_sync, _ = sync_inference(exec_net, image=preprocessed_image)
    # Index the results by blob name rather than by position, since the
    # ordering of exec_net.outputs is not guaranteed to match the list above.
    out_head_1_anchor_1 = result_sync[out_blob_h1a1]
    out_head_2_anchor_1 = result_sync[out_blob_h2a1]
    out_head_2_anchor_2 = result_sync[out_blob_h2a2]
    out_head_2_anchor_3 = result_sync[out_blob_h2a3]
    out_head_2_anchor_4 = result_sync[out_blob_h2a4]
    out_detection_conf = result_sync[out_blob_conf]
    out_detection_loc = result_sync[out_blob_loc]

    print(out_head_1_anchor_1.shape)
    print(out_head_2_anchor_1.shape)
    print(out_head_2_anchor_2.shape)
    print(out_head_2_anchor_3.shape)
    print(out_head_2_anchor_4.shape)
    print(out_detection_conf.shape)
    print(out_detection_loc.shape)
    print(" ")
    return frame
 
 
def main():
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("stream end.. exiting")
            break
        frame = run_model(frame)
        # cv2.imshow('frame', frame)
        # exit the loop when 'q' is pressed
        if cv2.waitKey(1) == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

main()

1 Answer

It seems like you're only printing the shapes of the model's output layers rather than parsing them.
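
Per the model's description in the Open Model Zoo, the five out_head_* blobs hold per-anchor action class scores at every grid cell, while out_detection_conf and out_detection_loc hold SSD-style person/background confidences and encoded box coordinates for every prior box; the sketch below shows how they fit together.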

Refer to action_detector.hpp (and action_detector.cpp) in the Smart Classroom demo for the details of how each anchor is decoded.
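
As a starting point, here is a rough Python sketch of that post-processing. It reuses the out_blob_* names from your code. The grid sizes, steps, prior-box dimensions, variances, confidence threshold, and the prior enumeration order below are all assumptions based on the demo's handling of the 400x680 input; verify every one of them against action_detector.hpp before relying on the boxes it produces.

import numpy as np

# A sketch only: all numeric constants below are assumptions to verify
# against action_detector.hpp / action_detector.cpp in the demo.
VARIANCES = (0.1, 0.1, 0.2, 0.2)  # assumed SSD box-encoding variances
CONF_THRESHOLD = 0.5              # assumed person-confidence threshold

# (action head blob names, grid height, grid width, step in pixels);
# assumes the 400x680 network input: head 1 on a fine grid (step 8) with
# one anchor, head 2 on a coarse grid (step 16) with four anchors per cell.
HEADS = [
    ([out_blob_h1a1], 50, 85, 8),
    ([out_blob_h2a1, out_blob_h2a2, out_blob_h2a3, out_blob_h2a4], 25, 43, 16),
]

def decode_detections(result, frame_w, frame_h, in_w=680, in_h=400):
    conf = result[out_blob_conf].reshape(-1, 2)  # [num_priors, (background, person)]
    loc = result[out_blob_loc].reshape(-1, 4)    # [num_priors, (dx, dy, dw, dh)]
    detections = []
    prior_idx = 0  # assumes priors are enumerated as (head, row, col, anchor)
    for blob_names, grid_h, grid_w, step in HEADS:
        action_maps = [result[name] for name in blob_names]  # each [1, n_actions, grid_h, grid_w]
        for row in range(grid_h):
            for col in range(grid_w):
                for action_map in action_maps:
                    score = float(conf[prior_idx, 1])
                    if score > CONF_THRESHOLD:
                        # Prior box: a cell-centred square of side `step`
                        # (a placeholder; the demo uses per-anchor box sizes).
                        pcx, pcy = (col + 0.5) * step, (row + 0.5) * step
                        pw = ph = float(step)
                        dx, dy, dw, dh = loc[prior_idx]
                        cx = VARIANCES[0] * dx * pw + pcx
                        cy = VARIANCES[1] * dy * ph + pcy
                        w = pw * np.exp(VARIANCES[2] * dw)
                        h = ph * np.exp(VARIANCES[3] * dh)
                        # The action class comes from the same grid cell of the
                        # head/anchor this prior belongs to.
                        action = int(action_map[0, :, row, col].argmax())
                        sx, sy = frame_w / in_w, frame_h / in_h
                        detections.append(((cx - w / 2) * sx, (cy - h / 2) * sy,
                                           w * sx, h * sy, score, action))
                    prior_idx += 1
    return detections

The demo also runs non-maximum suppression on the decoded boxes before reporting them, which you would want to replicate in Python.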

For writing this in Python, you may refer to the action_recognition_demo and object_detection_demo in the Open Model Zoo.
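
Assuming a decode_detections helper along the lines of the sketch above, your run_model could then draw the final boxes instead of printing shapes:

def run_model(frame):
    n, c, h, w = get_input_shape(exec_net)
    preprocessed_image = preprocessing(frame, n, c, h, w)
    result_sync, _ = sync_inference(exec_net, image=preprocessed_image)
    frame_h, frame_w = frame.shape[:2]
    for x, y, bw, bh, score, action in decode_detections(result_sync, frame_w, frame_h):
        # draw the box and its action label/confidence on the original frame
        cv2.rectangle(frame, (int(x), int(y)), (int(x + bw), int(y + bh)), (0, 255, 0), 2)
        cv2.putText(frame, "action %d: %.2f" % (action, score), (int(x), int(y) - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
    return frame

With that in place, re-enable the cv2.imshow call in main to see the result.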