I wanted to set up a gym environment to work on my school RL project. The environment is structured as follows: an Action Space of 7, with 6 for 3D movement and 1 for placing a block at coordinates (x, y, z). A file with an array of block coordinates will be supplied for the objective. There are certain rules for placing the block. Should I include the objective array and the blocks placed previously in the observation space (both are arrays)? Or should I return the observation with all the blocks placed previously, along with the current position? If yes, how should I do it? My current code is as follows:
def __init__(self):
    """Build the block-placement environment from the objective file."""
    # Target structure: an array of (x, y, z) block coordinates loaded from disk.
    self.objective = np.load('objective.npy')  # Load the objective from file
    # Padded bounding box of the objective — the workspace the cursor may roam.
    (self.min_x, self.max_x,
     self.min_y, self.max_y,
     self.min_z, self.max_z) = objective_functions.calculate_objective_space(self.objective)
    self.model = CompositeBlock()
    # 6 movement actions (+/- one step along each axis) plus one "place block" action.
    self.action_space = spaces.Discrete(7)
    # Observation: cursor (x, y, z) plus two binary flags
    # (is_placed_here, is_objective_here).
    lows = [self.min_x, self.min_y, self.min_z, 0, 0]
    highs = [self.max_x, self.max_y, self.max_z, 1, 1]
    self.observation_space = spaces.Box(
        low=np.array(lows),
        high=np.array(highs),
        dtype=np.int16,
    )
    self.reset()
def step(self, action):
    """Advance the environment by one action.

    Actions 0-5 move the cursor one cell along an axis (clipped so it
    never leaves the padded objective workspace); action 6 attempts to
    place a block at the cursor, subject to ``building_rules``.

    Returns the classic Gym 4-tuple ``(observation, reward, done, info)``.
    The observation is the 5-element vector declared in
    ``observation_space``: (x, y, z, is_placed_here, is_objective_here).
    """
    reward = 0
    done = False

    # action -> (axis index, delta); per-axis clip bounds alongside.
    moves = {
        0: (1, 1), 1: (1, -1),   # +y / -y
        2: (0, -1), 3: (0, 1),   # -x / +x
        4: (2, 1), 5: (2, -1),   # +z / -z
    }
    bounds = (
        (self.min_x, self.max_x),
        (self.min_y, self.max_y),
        (self.min_z, self.max_z),
    )

    if action in moves:
        axis, delta = moves[action]
        lo, hi = bounds[axis]
        # Clip the cursor position to stay within bounds.
        self.cur_pos[axis] = np.clip(self.cur_pos[axis] + delta, lo, hi)
    elif action == 6:  # Place block
        if building_rules(self.cur_pos, self.model.blocks):  # placement valid?
            # Store a copy: cur_pos is mutated in place on later moves, so
            # passing the array itself would retroactively shift every
            # previously placed block.
            self.model.add_block(self.cur_pos.copy())
            if any(np.array_equal(self.cur_pos, arr) for arr in self.objective):
                reward = 1      # placed an objective block
            else:
                reward = -0.1   # valid but off-target placement
            # Episode ends once every objective block has been placed.
            done = set(map(tuple, self.objective)).issubset(
                set(map(tuple, self.model.blocks)))

    # Build the observation to match the declared Box(5,): cursor position
    # plus two flags telling the agent what occupies the current cell.
    is_placed = int(any(np.array_equal(self.cur_pos, b) for b in self.model.blocks))
    is_objective = int(any(np.array_equal(self.cur_pos, arr) for arr in self.objective))
    self.observation = np.array([*self.cur_pos, is_placed, is_objective])
    return self.observation, reward, done, {}
def _get_info(self):
pass
def reset(self, seed=None, options=None):
    """Reset the episode: empty model, cursor at the origin.

    ``options`` is accepted (and ignored) for gymnasium API compatibility.
    Returns the initial 5-element observation matching ``observation_space``:
    (x, y, z, is_placed_here, is_objective_here).
    """
    # We need the following line to seed self.np_random.
    super().reset(seed=seed)
    # Reset the agent location to (0, 0, 0) with a fresh, empty model.
    self.model = CompositeBlock()
    self.cur_pos = np.array([0, 0, 0])
    # Fresh model -> nothing placed yet, so the is_placed flag is always 0;
    # the is_objective flag reflects whether the origin is a target cell.
    is_objective = int(any(np.array_equal(self.cur_pos, arr) for arr in self.objective))
    self.observation = np.array([*self.cur_pos, 0, is_objective])
    return self.observation
def render(self):
    """Visualize the environment — intentionally a no-op (optional for Gym)."""
    return None
`
I will need to train a model afterwards — which would be the best option for simplicity? Currently, my observation is an array with 5 elements consisting of the current position's x, y, and z coordinates, is_placed_before (0 or 1), and is_objective_block (0 or 1). I feel like the agent does not know where the objective blocks are. I wonder how to fit the objective blocks into the observation.