I'm trying to train an agent to play Pong using a Gym environment. During the learning process I collect the reward for every episode and build a reward curve from this data. My problem is that there is no progression in the learning process: the reward curve should rise, but it doesn't. The average reward per episode stays between -19 and -20, where -21 is the worst possible result.
I would like to ask for advice on what I should amend in my code, because it looks like I'm missing something.
I've read a lot of discussions about the best hyperparameters and also https://arxiv.org/pdf/1511.06581.pdf, trying to figure out where the problem is. I first tried to train the agent with the basic DQN architecture. I found information that it is one of the basic algorithms and that training could potentially take days to weeks, so I decided to build a more advanced architecture based on a Double Dueling DQN, because the training time should be much shorter compared with the basic DQN algorithm.
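Just to make explicit which target I'm trying to implement, here is the Double DQN update as I understand it (a minimal NumPy sketch with my own helper names, not taken from the papers; the actual code is further down):

import numpy as np

def double_dqn_targets(q_main_s1, q_target_s1, rewards, dones, gamma=0.99):
    # q_main_s1 / q_target_s1: (batch, num_actions) Q-values for the next states
    best_a = np.argmax(q_main_s1, axis=1)                 # action selected by the online (main) net
    next_q = q_target_s1[np.arange(len(best_a)), best_a]  # evaluated by the target net
    return rewards + (1.0 - dones) * gamma * next_q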
Parameters: I tried to reproduce the parameters and DQN architecture from https://arxiv.org/pdf/1810.00123.pdf. I don't understand why the author used consecutive Conv2D layers with stride: stride destroys spatial information, and the tiny "ball" of one or two pixels is the most important information in the picture. Anyway, I tried both options, with and without the stride.
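To illustrate my concern about the stride, this is the rough arithmetic I did for the spatial size of a 'valid' convolution, out = (in - kernel) // stride + 1 (a quick sketch; the (8, 4), (4, 2), (3, 1) kernel/stride pairs are the ones from the DQN papers, my own code uses stride 2 in the first layer):

def conv_out(size, kernel, stride):
    # spatial output size of a 'valid' (no padding) convolution
    return (size - kernel) // stride + 1

size = 80
for kernel, stride in [(8, 4), (4, 2), (3, 1)]:
    size = conv_out(size, kernel, stride)
print(size)  # 6 -> an 80x80 frame shrinks to 6x6, so a 1-2 pixel ball is a tiny fraction of each cell

size = 80
for kernel, stride in [(8, 1), (4, 1), (3, 1)]:
    size = conv_out(size, kernel, stride)
print(size)  # 68 -> without stride the feature maps stay large (and the flatten layer becomes huge)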
Preprocessing:
environment: I tried all of the following: Pong-v4 / PongDeterministic-v4 / PongNoFrameskip-v4.
image: grayscale, resized to (105, 80, 1); then I crop the score at the top and end up with (80, 80) frames of type np.float32.
the stack for inference contains 4 frames (optionally I can also skip frames with a frameskip parameter).
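For clarity, this is the shape flow I expect from the preprocessing described above (a standalone sketch, not the exact code below):

import numpy as np

obs = np.zeros((105, 80, 1), dtype=np.uint8)                  # grayscale frame after ResizeObservation
frame = (obs[17:-8, :, 0] / 255).astype(np.float32).round()   # crop the score rows: 105 - 17 - 8 = 80
stack = np.stack([frame] * 4, axis=-1)                        # 4-frame stack for inference
assert frame.shape == (80, 80) and stack.shape == (80, 80, 4)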
My current parameters:
NUM_EPISODES = 100000 # NUMBER OF EPISODES TO TRAIN
MAX_STEPS = 30000 # MAXIMUM NUMBER OF STEPS PER EPISODE
FRAME_STACK_SIZE = 4 # number of images in the stack for inference
FRAME_SIZE = (80, 80, FRAME_STACK_SIZE)
EXP_STACK_SIZE = 10000 # MAXIMUM NUMBER OF ELEMENTS IN THE MEMORY STACK
EXP_MINIBATCH_SIZE = 1024 # MINIBATCH SIZE FOR EXPERIENCE REPLAY
GAMMA = 0.99 # DISCOUNT FACTOR
LEARNING_RATE = 0.00025 # 0.00025
SKIP_FRAMES = 0 # if 3: skips 3 states, i.e. saves every 4th
My code is below:
import copy
import random

import gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from gym.wrappers import RecordVideo
from IPython import display
# NOTE: show_video() and save_best_video() are helper functions of mine that are not shown here

env = gym.make('PongNoFrameskip-v4', render_mode="rgb_array", obs_type="grayscale")
env = gym.wrappers.ResizeObservation(env, (105, 80))
# env = gym.wrappers.ResizeObservation(env, (84, 84))
STATE_SHAPE = env.observation_space.shape
NUM_ACTIONS = env.action_space.n
ACTION_MEANING = env.unwrapped.get_action_meanings()
def state_preprocess(s):
    s = s[17:-8, :] / 255 # crop the score rows (105 -> 80); reference crop: s = s[34:-16, :] / 255
s = s.astype(np.float32)[..., 0]
s = s.round()
return s
class Exp_stack():
def __init__(self, max_size = 10, batch_size = 5, inf_pack_size = 4):
self.max_size = max_size
self.batch_size = batch_size
self.inf_pack_size = inf_pack_size
self.stack = list()
self.full = False
def add(self, new_row):
self.stack.append(new_row)
self.keep_max_stack_size() # optimize buffer
self.is_full() # set state
def is_full(self):
if len(self.stack) >= self.batch_size:
self.full = True
else:
self.full = False
def keep_max_stack_size(self):
if len(self.stack) > self.max_size:
self.stack = self.stack[- self.max_size :]
def reset(self):
self.stack = list()
self.full = False
def get_minibatch(self):
index_list = random.sample(range(0, len(self.stack)), self.batch_size)
minibatch = [self.stack[idx] for idx in index_list]
s_batch, action_batch, reward_batch, s1_batch, done_batch = map(np.asarray, zip(*minibatch))
s_batch = s_batch.astype(np.float32)
s1_batch = s1_batch.astype(np.float32)
return s_batch, action_batch, reward_batch, s1_batch, done_batch
class Image_stack():
def __init__(self, max_size = 4):
self.max_size = max_size
self.stack = list()
self.full = False
def add(self, new_row):
self.stack.append(new_row)
self.keep_max_stack_size() # optimize buffer
self.is_full() # set state
def is_full(self):
if len(self.stack) == self.max_size:
self.full = True
else:
self.full = False
def keep_max_stack_size(self):
if len(self.stack) > self.max_size:
self.stack = self.stack[-self.max_size : ]
def get_pack_of_s(self, forward_order = True):
        tmp_list = [image[..., None] for image in [chain[0] for chain in self.stack]]
if forward_order:
res = np.concatenate(tmp_list, axis = 2, dtype=np.float32)
else:
res = np.concatenate(tmp_list[::-1], axis = 2, dtype=np.float32)
return res
def get_pack_of_s1(self, forward_order = True):
        tmp_list = [image[..., None] for image in [chain[3] for chain in self.stack]]
if forward_order:
res = np.concatenate(tmp_list, axis = 2, dtype=np.float32)
else:
res = np.concatenate(tmp_list[::-1], axis = 2, dtype=np.float32)
return res
def prepared_to_save_in_memory(self):
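        # bundle the 4 stacked "s" frames and "s1" frames with the action/reward/terminated flag of the most recent transition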
s_pack = self.get_pack_of_s()
a_pack = self.stack[-1][1]
r_pack = self.stack[-1][2]
s1_pack = self.get_pack_of_s1()
terminated_pack = self.stack[-1][4]
return s_pack, a_pack, r_pack, s1_pack, terminated_pack
def reset(self):
self.stack = list()
self.full = False
class Model(tf.keras.Model):
def __init__(self, frame_size, num_actions):
super(Model, self).__init__()
self.frame_size = frame_size # inference size
self.num_actions = num_actions
self.conv1 = tf.keras.layers.Conv2D(filters=32, kernel_size=(8, 8), strides = 2, #strides = 4,
activation='relu',
input_shape = self.frame_size,
data_format = 'channels_last',
# dilation_rate = 3,
kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
bias_initializer=tf.keras.initializers.Zeros(),
# kernel_regularizer = tf.keras.regularizers.L2(l2=0.01)
)
self.conv2 = tf.keras.layers.Conv2D(filters=64, kernel_size=(4, 4), #strides = 2,
activation='relu',
input_shape = self.frame_size,
data_format = 'channels_last',
# dilation_rate = 3,
kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
bias_initializer=tf.keras.initializers.Zeros(),
# kernel_regularizer = tf.keras.regularizers.L2(l2=0.01)
)
self.conv3 = tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), #strides = 1,
activation='relu',
data_format = 'channels_last',
kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
bias_initializer=tf.keras.initializers.Zeros(),
# dilation_rate = 3,
# kernel_regularizer = tf.keras.regularizers.L2(l2=0.01)
)
self.flatten = tf.keras.layers.Flatten()
self.dense_V_stream = tf.keras.layers.Dense(
units=512,
activation="relu",
kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
bias_initializer=tf.keras.initializers.Zeros(),
)
self.V = tf.keras.layers.Dense(1)
self.dense_A_stream = tf.keras.layers.Dense(
units=512,
activation="relu",
kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
bias_initializer=tf.keras.initializers.Zeros(),
)
self.A = tf.keras.layers.Dense(self.num_actions)
def call(self, obs):
"""Forward pass of the neural network with some inputs."""
obs = self.conv1(obs)
obs = self.conv2(obs)
obs = self.conv3(obs)
obs = self.flatten(obs)
v = self.dense_V_stream(obs)
v = self.V(v)
a = self.dense_A_stream(obs)
a = self.A(a)
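        # dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean over actions of A(s, a'))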
Q = v + tf.subtract(a, tf.reduce_mean(a, axis=1, keepdims=True))
return Q
class DQN:
def __init__(self, frame_size = (160, 160, 4), num_actions = 6, lr = 0.000025):
self.frame_size = frame_size # inference size
self.num_actions = num_actions
self.lr = lr
self.loss = tf.keras.losses.MeanSquaredError()
# self.optimizer = tf.keras.optimizers.SGD(learning_rate = self.lr) #, momentum=0.9, nesterov=True)
self.optimizer = tf.keras.optimizers.Adam(learning_rate = self.lr, clipnorm=10)
self.model = None
def build_model(self):
model = Model(self.frame_size, self.num_actions)
model.compile(loss = self.loss, optimizer = self.optimizer)
rows, cols, phi = self.frame_size
model.build(input_shape = (None, rows, cols, phi))
self.model = model
def evalQ_single(self, s):
inp = s[None, ...]
return self.model(inp).numpy()[0]
def evalQ_batch(self, s):
return self.model(s).numpy()
class Bot():
def __init__(self,
env = env,
num_episodes = 1000,
max_steps = 1000, # steps per episode
frame = Image_stack(4),
experience_stack = Exp_stack(max_size = 1000, batch_size = 16, inf_pack_size = 4) ,
dqn_main = None, # used for Predict next Action
dqn_target = None, # used for Evaluate Q_target
gamma = 0.99,
skip = 3
):
self.env = env
self.num_episodes = num_episodes
self.max_steps = max_steps
self.frame = frame
self.experience_stack = experience_stack
self.dqn_main = dqn_main
self.dqn_target = dqn_target
self.target_update()
self.gamma = gamma
self.skip = skip
self.current_epizode = 0
self.current_step = 0
self.tmp_q_target = None
def target_update(self):
weights = self.dqn_main.model.get_weights()
self.dqn_target.model.set_weights(weights)
def get_eps(self):
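        # epsilon = 1 - progress**0.2: drops steeply over the first few percent of episodes, then decays slowly towards 0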
return 1.0 - ( self.current_epizode / self.num_episodes)**0.2
def get_r_s1_term_chain(self, a_):
s1_, r_, terminated_, truncated_, info_ = self.env.step(a_) # make a STEP
s1_ = state_preprocess(s1_)
self.current_step += 1
return r_, s1_, terminated_
def play_epoch_collect_data(self):
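        # One episode: the inner loop repeats the current action until the 4-frame stack is full,
        # then the stacked transition is written to the replay memory and a new epsilon-greedy action is chosen.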
self.current_step = 0
self.frame.reset()
s = self.env.reset()[0] # RESET ENVIRONMENT
s = state_preprocess(s)
a = self.env.action_space.sample()
reward_per_epoch = 0
while self.current_step < self.max_steps:
reward_per_frame = 0
while not self.frame.full:
r, s1, terminated = self.get_r_s1_term_chain(a)
reward_per_frame += r
## SAVE IN MEMORY
if self.current_step % (self.skip + 1) == 0:
                    self.frame.add([s, a, r , s1, terminated]) # store the (s, a, r, s1, terminated) chain (reward scaling could be applied here)
s = copy.deepcopy(s1)
if terminated:
self.frame.reset()
break
else:
                prepared_pack = self.frame.prepared_to_save_in_memory() # extract a stacked pack from the collected (s, a, r, s1) chains
self.experience_stack.add(prepared_pack) # Save extracted_pack into MEMORY
self.frame.reset()
reward_per_epoch += reward_per_frame
reward_per_frame = 0
if np.random.rand() < self.get_eps(): # get random action
a = self.env.action_space.sample()
# print('random')
else:
                    Q_s_sample = self.dqn_main.evalQ_single(prepared_pack[0]) # get the Q-values for the current stacked state
                    a = np.argmax(Q_s_sample) # choose the greedy action under the current policy
# print('by policy')
return reward_per_epoch
def train_on_collected_data(self, train_cycles = 1):
for i_cycle in range(train_cycles):
#################################################
            # TRAIN Q FUNCTION ON MINIBATCH
if self.experience_stack.full:
s_batch, action_batch, reward_batch, s1_batch, done_batch = self.experience_stack.get_minibatch()
Q_s = self.dqn_target.evalQ_batch(s_batch)
# print('Q_s', Q_s)
# print('action_batch', action_batch)
                # next_q_values = self.dqn_target.evalQ_batch(s1_batch).max(axis=1) # vanilla DQN target
                # Double DQN target: the main net selects the next action, the target net evaluates it
                best_actions = np.argmax(self.dqn_main.evalQ_batch(s1_batch), axis=1)
                next_q_values = self.dqn_target.evalQ_batch(s1_batch)[range(self.experience_stack.batch_size), best_actions]
Q_s[range(self.experience_stack.batch_size), action_batch] = reward_batch + (1-done_batch) * next_q_values * self.gamma
self.tmp_q_target = Q_s # just for case
# print('Q_s type', Q_s.dtype)
self.dqn_main.model.train_on_batch(s_batch, Q_s)
def play_game(self, play_steps = 1000):
env_sim = RecordVideo(self.env, './video', episode_trigger = lambda episode_number: True)
# env_sim = RecordVideo(self.env, './video', video_length = 1000, step_trigger=lambda x: x % 10 == 0)
self.current_step = 0
self.frame.reset()
s = env_sim.reset()[0] # RESET ENVIRONMENT
s = state_preprocess(s)
a = env_sim.action_space.sample()
reward_per_epoch = 0
while self.current_step < self.max_steps:
reward_per_frame = 0
while not self.frame.full:
s1, r, terminated, truncated, info = env_sim.step(a) # make a STEP
s1 = state_preprocess(s1)
self.current_step += 1
reward_per_frame += r
## SAVE IN MEMORY
if self.current_step % (self.skip + 1) == 0:
                    self.frame.add([s, a, r , s1, terminated]) # store the (s, a, r, s1, terminated) chain (reward scaling could be applied here)
s = copy.deepcopy(s1)
if terminated:
self.frame.reset()
break
else:
                prepared_pack = self.frame.prepared_to_save_in_memory() # extract a stacked pack from the collected (s, a, r, s1) chains
self.experience_stack.add(prepared_pack) # Save extracted_pack into MEMORY
self.frame.reset()
reward_per_epoch += reward_per_frame
reward_per_frame = 0
                Q_s_sample = self.dqn_main.evalQ_single(prepared_pack[0]) # get the Q-values for the current stacked state
                a = np.argmax(Q_s_sample) # choose the greedy action under the current policy
# print('by policy')
env_sim.close()
show_video()
return reward_per_epoch
def play_and_train_epoch(self, train_cycles = 1, train_every = 4):
pack_counter = 0
self.current_step = 0
self.frame.reset()
s = self.env.reset()[0] # RESET ENVIRONMENT
s = state_preprocess(s)
a = self.env.action_space.sample()
reward_per_epoch = 0
while self.current_step < self.max_steps:
reward_per_frame = 0
while not self.frame.full:
r, s1, terminated = self.get_r_s1_term_chain(a)
reward_per_frame += r
## SAVE IN MEMORY
if self.current_step % (self.skip + 1) == 0:
                    self.frame.add([s, a, r , s1, terminated]) # store the (s, a, r, s1, terminated) chain (reward scaling could be applied here)
s = copy.deepcopy(s1)
if terminated:
self.frame.reset()
break
else:
                prepared_pack = self.frame.prepared_to_save_in_memory() # extract a stacked pack from the collected (s, a, r, s1) chains
pack_counter += 1
self.experience_stack.add(prepared_pack) # Save extracted_pack into MEMORY
self.frame.reset()
reward_per_epoch += reward_per_frame
reward_per_frame = 0
if np.random.rand() < self.get_eps(): # get random action
a = self.env.action_space.sample()
# print('random')
else:
                    Q_s_sample = self.dqn_main.evalQ_single(prepared_pack[0]) # get the Q-values for the current stacked state
                    a = np.argmax(Q_s_sample) # choose the greedy action under the current policy
# print('by policy')
###################################
# train_cycles = len(self.experience_stack.stack) // self.experience_stack.batch_size
if self.experience_stack.full and (pack_counter % train_every == 0):
# print('trained')
self.train_on_collected_data(train_cycles = train_cycles) # TRAIN
###################################
return reward_per_epoch
# COLLECT and TRAIN SEPARATELY
def train(self, train_cycles = 1, REWARD_AVERAGE_WINDOW = 10, weights_update_per_epizode = 1):
reward_history = list() # save results when training
        reward_history_average = list() # episode rewards averaged over a window
just_play_reward_list = list() # save results when testing - without random actions - on video
for self.current_epizode in range(self.num_episodes):
# COLLECT
# print('Collecting...')
reward_per_epoch = self.play_epoch_collect_data()
reward_history.append(reward_per_epoch)
# TRAIN
# print('Training...')
self.train_on_collected_data(train_cycles = train_cycles)
            # Target NN weights update
if self.current_epizode % weights_update_per_epizode == 0:
self.target_update()
# TEST ON VIDEO
# print('Testing...')
if (self.current_epizode > REWARD_AVERAGE_WINDOW) and (self.current_epizode % REWARD_AVERAGE_WINDOW == 0):
display.clear_output(wait=True)
reward_history_average.append(np.mean(reward_history[-REWARD_AVERAGE_WINDOW :])) # calc mean history
# just_play_reward = self.play_game(play_steps = self.max_steps) # play game without random
#save best video
# if len(just_play_reward_list) == 0 or just_play_reward > max(just_play_reward_list):
# save_best_video()
# just_play_reward_list.append(just_play_reward) # save reward
                print('episode', self.current_epizode)
                # print('just_play_reward: ', just_play_reward)
                if self.tmp_q_target is not None:
print('Q_target', self.tmp_q_target[0])
plt.plot(reward_history_average, c = 'blue', label="train score")
# plt.plot(just_play_reward_list, c = 'red', label="test score")
plt.legend(loc="upper left")
# plt.scatter(x = len(reward_history_average)-1, y = just_play_reward, marker = 'x', c = 'red')
plt.show()
print('finished')
# COLLECT and TRAIN at the same time
def train_simultaneously(self, train_cycles = 1, REWARD_AVERAGE_WINDOW = 1, train_every = 4, weights_update_per_epizode = 1):
reward_history = list() # save results when training
        reward_history_average = list() # episode rewards averaged over a window
just_play_reward_list = list() # save results when testing - without random actions - on video
for self.current_epizode in range(self.num_episodes):
# COLLECT and TRAIN epoch
# print('Collecting...')
reward_per_epoch = self.play_and_train_epoch(train_cycles = train_cycles, train_every = train_every)
reward_history.append(reward_per_epoch)
            # Target NN weights update
if self.current_epizode % weights_update_per_epizode == 0:
self.target_update()
# TEST ON VIDEO
# print('Testing...')
if (self.current_epizode > REWARD_AVERAGE_WINDOW) and (self.current_epizode % REWARD_AVERAGE_WINDOW == 0):
display.clear_output(wait=True)
reward_history_average.append(np.mean(reward_history[-REWARD_AVERAGE_WINDOW :])) # calc mean history
# just_play_reward = self.play_game(play_steps = self.max_steps) # play game without random
#save best video
# if len(just_play_reward_list) == 0 or just_play_reward > max(just_play_reward_list):
# save_best_video()
# just_play_reward_list.append(just_play_reward) # save reward
                print('episode', self.current_epizode)
                # print('just_play_reward: ', just_play_reward)
                if self.tmp_q_target is not None:
print('Q_target', self.tmp_q_target[0])
plt.plot(reward_history_average, c = 'blue', label="train score")
# plt.plot(just_play_reward_list, c = 'red', label="test score")
plt.legend(loc="upper left")
# plt.scatter(x = len(reward_history_average)-1, y = just_play_reward, marker = 'x', c = 'red')
plt.show()
print('finished')
NUM_EPISODES = 100000 # NUMBER OF EPISODES TO TRAIN
MAX_STEPS = 30000 # MAXIMUM NUMBER OF STEPS PER EPISODE
REWARD_AVERAGE_WINDOW = NUM_EPISODES // 10 # window for averaging episode rewards
FRAME_STACK_SIZE = 4 # num images in stack for inference
FRAME_SIZE = (80, 80, FRAME_STACK_SIZE)
EXP_STACK_SIZE = 10000 # MAXIMAL NUMBER OF ELEMENTS IN MEMORY STACK
EXP_MINIBATCH_SIZE = 1024 #16
GAMMA = 0.99 # DISCOUNTING PARAMETER
LEARNING_RATE = 0.00025 # 0.00025
SKIP_FRAMES = 0 # if 3: skips 3 states, i.e. saves every 4th
###########################################
###########################################
# CREATE GLOBAL OBJECTS
# MEMORY STACK. ACCUMULATES DATA FROM ALL STEPS. ZIPS SEVERAL IMAGES FROM SEVERAL STEPS INTO A TENSOR
experience_stack = Exp_stack(max_size = EXP_STACK_SIZE,
batch_size = EXP_MINIBATCH_SIZE,
                             inf_pack_size = FRAME_STACK_SIZE) # collection of (S, A, R, S1, DONE) chains
# FRAME. Image stack for inference # USED FOR EVERY STEP TO GET AN ACTION
frame = Image_stack(FRAME_STACK_SIZE)
# CREATE GAME ENVIRONMENT
env = gym.make('PongNoFrameskip-v4', render_mode="rgb_array", obs_type="grayscale")
env = gym.wrappers.ResizeObservation(env, (105, 80))
# CREATE and BUILD DQN model
dqn_main = DQN(frame_size = FRAME_SIZE, num_actions = NUM_ACTIONS, lr = LEARNING_RATE) # Deep Q Net object
dqn_main.build_model()
dqn_target = DQN(frame_size = FRAME_SIZE, num_actions = NUM_ACTIONS, lr = LEARNING_RATE) # Deep Q Net object
dqn_target.build_model()
# CREATE BOT
bot = Bot(env = env,
num_episodes = NUM_EPISODES,
max_steps = MAX_STEPS, # steps per episode
frame = frame, experience_stack = experience_stack,
dqn_main = dqn_main,
dqn_target = dqn_target,
gamma = GAMMA,
skip = SKIP_FRAMES
)
# START TRAINING
bot.train(train_cycles = 1, REWARD_AVERAGE_WINDOW = 100, weights_update_per_epizode = 1000)