I am trying to create a custom PyEnvironment to make an agent learn the optimum hour to send a notification to users, based on the rewards received from clicks on the notifications sent in the previous 7 days.
After training is complete, the agent always picks the same hour, even when different states are fed in.
State structure: we consider the user's hour-response pairs for the last 7 days.
E.g. 7,0,14,1,5,0,3,0,23,0,9,1,12,0 (reward 0 at hour 7, reward 1 at hour 14, and so on).
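For illustration, here is a minimal sketch of how such a 7-day history could be packed into the flat state vector described above (the encode_history helper is hypothetical and not part of the original code):

import numpy as np

# Hypothetical helper: turn a list of (hour, clicked) pairs for the last
# 7 days into the flat 14-element state vector described above.
def encode_history(history):
  state = np.zeros(2 * len(history), dtype=np.float32)
  for i, (hour, clicked) in enumerate(history):
    state[2 * i] = hour
    state[2 * i + 1] = clicked
  return state

# Produces [ 7. 0. 14. 1. 5. 0. 3. 0. 23. 0. 9. 1. 12. 0.]
print(encode_history([(7, 0), (14, 1), (5, 0), (3, 0), (23, 0), (9, 1), (12, 0)]))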
Hyperparameters:
duration = 1 # @param {type:"integer"}
discount = 0.99 # @param {type:"number"}
state_size = 7 # @param {type:"integer"}
num_episodes = 1 # @param {type:"integer"}
learning_rate = 1e-5 # @param {type:"number"}
num_eval_episodes = 100 # @param {type:"integer"}
replay_buffer_max_length = 100000 # @param {type:"integer"}
initial_collect_steps = 100 # @param {type:"integer"}
batch_size = 10 # @param {type:"integer"}
num_iterations = 1000 # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
log_interval = 500 # @param {type:"integer"}
eval_interval = 500 # @param {type:"integer"}
Below is the code of the NotifEnv class:
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

class NotifEnv(py_environment.PyEnvironment):

  def __init__(self, duration, discount, state_size):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=23, name='action')
    # The state stores (hour + 1, reward) pairs: hours go up to 24 and the
    # reward can be -0.3, so the original [0, 1] bounds were too tight.
    self._state_spec = array_spec.BoundedArraySpec(
        shape=(2 * state_size,), dtype=np.float32, minimum=-1.0, maximum=24.0,
        name='state')
    self._discount_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.float32, minimum=0.0, maximum=1.0, name='discount')
    self.step_count = 0
    self.duration = duration
    self.state_size = state_size
    self.discount = discount
    self._episode_ended = False
    self.state = np.zeros(2 * self.state_size, dtype=np.float32)

  def observation_spec(self):
    """Return state_spec."""
    return self._state_spec

  def action_spec(self):
    """Return action_spec."""
    return self._action_spec

  def _reset(self):
    """Return initial_time_step."""
    self.state = np.zeros(2 * self.state_size, dtype=np.float32)
    self._episode_ended = False
    self.step_count = 0
    return ts.restart(np.array(self.state, dtype=np.float32))

  def _step(self, action):
    """Apply action and return new time_step."""
    if action < 0 or action > 23:
      raise ValueError('`action` should be between 0 and 23.')
    self.step_count += 1
    curr_reward = get_reward(action)
    # Shift the history left by one (hour, reward) pair and append the new one.
    self.state[:-2] = self.state[2:]
    self.state[-2:] = [action + 1, curr_reward]
    if self.step_count >= self.duration:  # was the global `duration`
      self._episode_ended = True          # was `self.episode_ended`
      self.step_count = 0
      return ts.termination(
          np.array(self.state, dtype=np.float32), reward=curr_reward)
    else:
      return ts.transition(
          np.array(self.state, dtype=np.float32), discount=self.discount,
          reward=curr_reward)
Reward function (a reward is given only if the hour of the day is < 12; otherwise the agent is penalised):
def get_reward(action):
  if action < 12:
    return 1
  else:
    return -0.3
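As a sanity check (not part of the original post), the environment can be run against its own specs with TF-Agents' built-in validation utility; this would also catch observations that fall outside the declared bounds:

from tf_agents.environments import utils

# Runs a few episodes with random actions and checks that observations,
# rewards and discounts match the declared specs.
utils.validate_py_environment(
    NotifEnv(duration=duration, discount=discount, state_size=state_size),
    episodes=5)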
Model Architecture :
from tf_agents.environments import tf_py_environment

train_env = tf_py_environment.TFPyEnvironment(
    NotifEnv(duration=duration, discount=discount, state_size=state_size))
eval_env = tf_py_environment.TFPyEnvironment(
    NotifEnv(duration=duration, discount=discount, state_size=state_size))

def compute_avg_return(environment, policy, num_episodes=10):
  total_return = 0.0
  for _ in range(num_episodes):
    time_step = environment.reset()
    episode_return = 0.0
    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return
  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]
Training :
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import sequential
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

fc_layer_params = (6,)
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())  # was `env`, which is undefined
num_actions = 24

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# The QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)
print("train_step_counter = ", train_step_counter)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()
Initialising Replay Buffer :
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=20)  # replay_buffer_max_length

def collect_step(environment, policy, buffer):
  time_step = environment.current_time_step()
  print("time_step = " + str(time_step))
  action_step = policy.action(time_step)
  print("action_step = " + str(action_step))
  next_time_step = environment.step(action_step.action)
  print("next_time_step = " + str(next_time_step))
  traj = trajectory.from_transition(time_step, action_step, next_time_step)
  print("traj = " + str(traj))
  print("\n \n")
  # Add trajectory to the replay buffer
  buffer.add_batch(traj)

def collect_data(env, policy, buffer, steps):
  for _ in range(steps):
    print("NUM : " + str(_))
    collect_step(env, policy, buffer)

collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)
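Note that random_policy used above and dataset used in the training loop below are not shown in the post; presumably they are created as in the standard TF-Agents DQN tutorial, roughly like this:

from tf_agents.policies import random_tf_policy

# Assumed definitions (not shown in the original post), following the
# standard TF-Agents DQN tutorial.
random_policy = random_tf_policy.RandomTFPolicy(
    train_env.time_step_spec(), train_env.action_spec())

# Dataset that yields batches of adjacent time-step pairs from the replay
# buffer, as required by DqnAgent.train.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)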
Training Run :
iterator = iter(dataset)

agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

print("log_interval = ", log_interval)
print("eval_interval = ", eval_interval)
print("returns = ", returns)

for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  # collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  # print("x = " + str(x))
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))
    print(np.max(q_net.get_weights()[0]))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)
I think that 1000 training steps could be one of the reasons the agent is stuck picking a single action. RL agents often need hundreds of thousands of iterations, and the number can be even higher depending on the task and on some neural network hyperparameters, such as the learning rate.
I suggest you increase num_iterations by a lot and then check whether anything improves.
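For example, something along these lines (the exact numbers are only a starting point, not tuned values for your task):

num_iterations = 200000   # was 1000
log_interval = 1000
eval_interval = 5000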