I wanted to train ET-Track (a nice video object tracker), which is based on Ocean (another video object tracker).
The problem is that the authors never released training code. We now have our own code (adapted from Ocean's), but we have a big problem with it: there is a random shift when we crop the template/search images. Here is an example:
As you can see, the search2 image that the model will train on has a shift in it. I can't figure out why I have this shift. Would you help me, please? This is how those outputs are created:
import cv2
import numpy as np

template, search, out_label, reg_label, reg_weight, bbox = dataset[0]

# Draw the ground-truth box on the search image (CHW -> HWC uint8).
x1, y1, x2, y2 = map(int, bbox)
search = search.transpose((1, 2, 0)).astype(np.uint8)
search = cv2.rectangle(search, (x1, y1), (x2, y2), (200, 100, 150), 1)

# Upscale the 25x25 label maps to the search-image resolution for visualization.
reg_weight = cv2.cvtColor(reg_weight.astype(np.uint8), cv2.COLOR_GRAY2RGB)
reg_weight = cv2.resize(reg_weight, (search.shape[1], search.shape[0]))
out_label = cv2.cvtColor(out_label.astype(np.uint8) * 255, cv2.COLOR_GRAY2RGB)
out_label = cv2.resize(out_label, (search.shape[1], search.shape[0]))

# Mask the search image with the (0/1) regression weight and draw the box again.
search2 = cv2.rectangle(search * reg_weight, (x1, y1), (x2, y2), (200, 100, 150), 1)

cv2.imshow("search2", search2)
cv2.imshow("search", search)
cv2.imshow("out_label", out_label)
cv2.waitKey(0)
And this is my dataset's __getitem__:
if self.random_data:
    # choose two random frames for search and template
    template, search = self._get_pairs(index)

    template_image = cv2.imread(template[0].as_posix())
    search_image = cv2.imread(search[0].as_posix())

    # convert the bbox format and pick the first box
    template_target_bbox = self.yolo2ocean(template[1], template_image)
    search_target_bbox = self.yolo2ocean(search[1], search_image)

    _, template_image = crop_like_SiamFC(
        template_image,
        bbox=template_target_bbox,
        exemplar_size=self.template_size,
        instance_size=self.search_size,
    )
    _, search_image = crop_like_SiamFC(
        search_image,
        bbox=search_target_bbox,
        exemplar_size=self.template_size,
        instance_size=self.search_size + self.search_margin,
    )

    template_box = self._toBBox(template_image, template_target_bbox)
    search_box = self._toBBox(search_image, search_target_bbox)

    template, _, _ = self._augmentation(
        template_image, template_box, self.template_size
    )
    search, bbox, dag_param = self._augmentation(
        search_image, search_box, self.search_size, search=True
    )
    # No augmentation is applied: I have turned all of it off!

    # from PIL Image to numpy
    template = np.array(template)
    search = np.array(search)

    out_label = self._dynamic_label([self.size, self.size], dag_param.shift)
    reg_label, reg_weight = self.reg_label(bbox)
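A quick check right after those two _augmentation calls should rule the augmentation in or out (a sketch; it assumes dag_param.shift is the (x, y) offset that _augmentation applied):

# With all augmentation turned off, the applied shift should be (0, 0)
# and the bbox should stay centered in the search crop.
# NOTE: assumes dag_param.shift is the offset _augmentation applied.
print("augmentation shift:", dag_param.shift)
cx = (bbox[0] + bbox[2]) / 2
cy = (bbox[1] + bbox[3]) / 2
print("bbox center:", (cx, cy), "expected:", self.search_size // 2)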
I think self.reg_label is the actual problem, but I don't know why. This is the function:
def reg_label(self, bbox):
    """
    generate regression label
    :param bbox: [x1, y1, x2, y2]
    :return: [l, t, r, b]
    """
    x1, y1, x2, y2 = bbox
    l = self.grid_to_search_x - x1  # [25, 25]
    t = self.grid_to_search_y - y1
    r = x2 - self.grid_to_search_x
    b = y2 - self.grid_to_search_y
    l, t, r, b = map(lambda x: np.expand_dims(x, axis=-1), [l, t, r, b])
    reg_label = np.concatenate((l, t, r, b), axis=-1)  # [25, 25, 4]
    reg_label_min = np.min(reg_label, axis=-1)
    inds_nonzero = (reg_label_min > 0).astype(float)
    return reg_label, inds_nonzero
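As far as I understand it, inds_nonzero marks exactly the grid positions that fall strictly inside the bbox, so for a box centered in the search image the ones should form a centered block. Here is a standalone check of that math (a sketch, assuming size=25, stride=8, and search_size=255, matching grids() below):

import numpy as np

size, stride, search_size = 25, 8, 255  # assumed values, matching grids()
half = size // 2
x, y = np.meshgrid(np.arange(size) - half, np.arange(size) - half)
gx = x * stride + search_size // 2
gy = y * stride + search_size // 2

# A 64x64 box exactly centered on the grid center (127, 127):
x1, y1, x2, y2 = 95, 95, 159, 159
inside = (gx > x1) & (gx < x2) & (gy > y1) & (gy < y2)
print(inside.astype(int))  # should print a centered 7x7 block of ones

And this is grids() itself: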
def grids(self):
    """
    each element of feature map on input search image
    :return: H*W*2 (position for each element)
    """
    sz = self.size  # 25
    sz_x = sz // 2
    sz_y = sz // 2
    x, y = np.meshgrid(
        np.arange(0, sz) - np.floor(float(sz_x)),
        np.arange(0, sz) - np.floor(float(sz_y)),
    )
    self.grid_to_search = {}
    self.stride = 8
    self.grid_to_search_x = x * self.stride + self.search_size // 2
    self.grid_to_search_y = y * self.stride + self.search_size // 2
Any idea would be helpful. Thanks a lot!
