Issue with training a DQN agent to play Pong: no progress during learning


I'm trying to train an agent to play Pong using a Gym environment. During training I collect the reward for every episode and build a reward curve from this data. My problem is that there is no progress in the learning process: the reward curve should rise, but it doesn't. The average reward per episode stays between -19 and -20, where -21 is the worst possible result.

I would like to ask for advice on what I should amend in my code, because it looks like I'm missing something.

I've read a lot of discussions about the best hyperparameters and also https://arxiv.org/pdf/1511.06581.pdf, trying to figure out where the problem is. I first tried to train the agent using the basic DQN architecture. I found that it is one of the most basic algorithms and training could potentially take days to weeks, so I decided to build a more advanced architecture based on Double Dueling DQN, because the training time should be much shorter compared with the basic DQN algorithm.
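
Just to be explicit about what I'm trying to implement (this is my understanding of the double / dueling DQN papers, so please correct me if this part is already wrong):

y = r + gamma * (1 - done) * Q_target(s', argmax_a' Q_main(s', a'))

Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))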

Parameters: I tried to reproduce the parameters and DQN architecture from https://arxiv.org/pdf/1810.00123.pdf. I don't understand why the author used consecutive Conv2D layers with a stride. Striding destroys spatial information, and the tiny "ball", only one or two pixels in size, is the most important information in the picture. Anyway, I tried both options, with and without the stride, as illustrated by the quick check below.
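
To illustrate the stride concern, here is a quick sanity check of the conv output sizes (assuming 'valid' padding and my 80x80 input; the first set of strides is the one I commented out in the code below, which I believe matches the paper, the second is what my code actually uses):

def conv_out(size, kernel, stride):
    # output spatial size of a Conv2D layer with 'valid' padding
    return (size - kernel) // stride + 1

size = 80
for kernel, stride in [(8, 4), (4, 2), (3, 1)]:  # strided variant
    size = conv_out(size, kernel, stride)
print(size)  # 6 -> the 1-2 px ball is heavily compressed

size = 80
for kernel, stride in [(8, 2), (4, 1), (3, 1)]:  # strides I use below
    size = conv_out(size, kernel, stride)
print(size)  # 32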

Preprocessing: environment - I tried all of the following: Pong-v4 / PongDeterministic-v4 / PongNoFrameskip-v4. Image - I take the grayscale observation and resize it to (105, 80, 1). Then I cut off the top of the picture with the score and finally get an (80, 80, 1) frame of np.float32 type.
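
A minimal sketch of what the preprocessing does to the shapes (assuming the resized grayscale observation arrives as (105, 80, 1)):

import numpy as np

obs = np.zeros((105, 80, 1), dtype=np.uint8)  # resized grayscale frame
s = obs[17:-8, :] / 255                       # cut off the score area -> (80, 80, 1)
s = s.astype(np.float32)[..., 0]              # drop the channel axis -> (80, 80)
s = s.round()                                 # values become 0.0 / 1.0
print(s.shape)                                # (80, 80), later stacked to (80, 80, 4)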

The stack used for inference contains 4 frames (optionally I can skip frames with the SKIP_FRAMES parameter).
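
So a single network input is built roughly like this (a sketch with four dummy preprocessed frames):

import numpy as np

frames = [np.zeros((80, 80), dtype=np.float32) for _ in range(4)]  # last 4 preprocessed frames
state = np.stack(frames, axis=-1)                                  # (80, 80, 4) -> fed to the network
print(state.shape)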

My current parameters:

NUM_EPISODES = 100000       # NUMBER OF EPISODES TO TRAIN
MAX_STEPS = 30000           # MAXIMAL AMOUNT OF STEPS PER EPISODE

FRAME_STACK_SIZE = 4        # num images in stack for inference
FRAME_SIZE = (80, 80, FRAME_STACK_SIZE)

EXP_STACK_SIZE = 10000      # MAXIMAL NUMBER OF ELEMENTS IN MEMORY STACK
EXP_MINIBATCH_SIZE = 1024   # MINIBATCH SIZE FOR EXPERIENCE REPLAY

GAMMA = 0.99                # DISCOUNTING PARAMETER
LEARNING_RATE = 0.00025     # 0.00025

SKIP_FRAMES = 0             # if 3 = amount of states to skip - saves every 4th
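
One thing not listed above is the exploration schedule; epsilon decays with the episode number like this (the same formula as get_eps() in the code below):

def get_eps(current_episode, num_episodes=NUM_EPISODES):
    # decays from 1.0 at the start of training towards 0.0 at the end
    return 1.0 - (current_episode / num_episodes) ** 0.2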

My code is below:

env = gym.make('PongNoFrameskip-v4', render_mode="rgb_array", obs_type="grayscale") 
env = gym.wrappers.ResizeObservation(env, (105, 80))
# env = gym.wrappers.ResizeObservation(env, (84, 84))

STATE_SHAPE = env.observation_space.shape
NUM_ACTIONS = env.action_space.n
ACTION_MEANING = env.unwrapped.get_action_meanings()


def state_preprocess(s):  
    s = s[17: -8 , : ] / 255  # cut off the score area (reference: s = s[34: -16, :] / 255)
    s = s.astype(np.float32)[..., 0]  # drop the channel axis
    s = s.round()    
    return s

class Exp_stack():
    def __init__(self, max_size = 10, batch_size = 5, inf_pack_size = 4):
        self.max_size = max_size
        self.batch_size = batch_size
        self.inf_pack_size = inf_pack_size
        self.stack = list()                    
        self.full = False

    def add(self, new_row):
        self.stack.append(new_row)
        self.keep_max_stack_size() # optimize buffer
        self.is_full() # set state

    def is_full(self):
        
        if len(self.stack) >= self.batch_size:
            self.full = True
        else:
            self.full = False

    def keep_max_stack_size(self):
        if len(self.stack) > self.max_size:
            self.stack = self.stack[- self.max_size :]


    def reset(self):
        self.stack = list()
        self.full = False

    def get_minibatch(self):
        index_list = random.sample(range(0, len(self.stack)), self.batch_size)
        minibatch = [self.stack[idx] for idx in index_list]
        s_batch, action_batch, reward_batch, s1_batch, done_batch = map(np.asarray, zip(*minibatch))

        s_batch = s_batch.astype(np.float32)
        s1_batch = s1_batch.astype(np.float32)
        return s_batch, action_batch, reward_batch, s1_batch, done_batch

        


class Image_stack():
    def __init__(self, max_size = 4):
        self.max_size = max_size        
        self.stack = list()
        self.full = False

    def add(self, new_row):        
        self.stack.append(new_row)

        self.keep_max_stack_size() # optimize buffer
        self.is_full() # set state


    def is_full(self):
        if len(self.stack) == self.max_size:
            self.full = True
        else:
            self.full = False

    def keep_max_stack_size(self):
        
        if len(self.stack) > self.max_size:           
            self.stack = self.stack[-self.max_size : ]



    def get_pack_of_s(self, forward_order = True):
        tmp_list = [image[..., None] for image in [chain[0] for chain in self.stack]]
        
        if forward_order:
            res = np.concatenate(tmp_list, axis = 2, dtype=np.float32)
        else:
            res = np.concatenate(tmp_list[::-1], axis = 2, dtype=np.float32)
            
        return res

    def get_pack_of_s1(self, forward_order = True):
        tmp_list = [image[..., None] for image in [chain[3] for chain in self.stack]]
        
        if forward_order:
            res = np.concatenate(tmp_list, axis = 2, dtype=np.float32)
        else:
            res = np.concatenate(tmp_list[::-1], axis = 2, dtype=np.float32)
            
        return res

    
    def prepared_to_save_in_memory(self):
        s_pack = self.get_pack_of_s()
        a_pack = self.stack[-1][1]
        r_pack = self.stack[-1][2]
        s1_pack = self.get_pack_of_s1()
        terminated_pack = self.stack[-1][4]
        
        return s_pack, a_pack, r_pack, s1_pack, terminated_pack
        

    def reset(self):
        self.stack = list()
        self.full = False



class Model(tf.keras.Model):

    def __init__(self, frame_size, num_actions):
        super(Model, self).__init__()

        self.frame_size = frame_size # inference size
        self.num_actions = num_actions

        self.conv1 = tf.keras.layers.Conv2D(filters=32, kernel_size=(8, 8), strides = 2, #strides = 4,     
                                         activation='relu', 
                                         input_shape = self.frame_size,
                                         data_format = 'channels_last',  
                                         # dilation_rate = 3,
                                         kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
                                         bias_initializer=tf.keras.initializers.Zeros(),
                                         
                                         # kernel_regularizer = tf.keras.regularizers.L2(l2=0.01)
                                        )

        self.conv2 =  tf.keras.layers.Conv2D(filters=64, kernel_size=(4, 4),  #strides = 2,    
                                         activation='relu', 
                                         input_shape = self.frame_size,
                                         data_format = 'channels_last',  
                                         # dilation_rate = 3,
                                         kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
                                         bias_initializer=tf.keras.initializers.Zeros(),
                                         
                                         # kernel_regularizer = tf.keras.regularizers.L2(l2=0.01)
                                        )

        self.conv3 = tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), #strides = 1, 
                                         activation='relu', 
                                         data_format = 'channels_last',
                                         kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
                                         bias_initializer=tf.keras.initializers.Zeros(),
                                         # dilation_rate = 3,
                                         # kernel_regularizer = tf.keras.regularizers.L2(l2=0.01)
                                        )

        self.flatten = tf.keras.layers.Flatten()
        self.dense_V_stream = tf.keras.layers.Dense(
                                        units=512,
                                        activation="relu",
                                        kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
                                        bias_initializer=tf.keras.initializers.Zeros(),
                                    )
        self.V = tf.keras.layers.Dense(1)

        
        self.dense_A_stream = tf.keras.layers.Dense(
                                        units=512,
                                        activation="relu",
                                        kernel_initializer=tf.keras.initializers.VarianceScaling(2.0),
                                        bias_initializer=tf.keras.initializers.Zeros(),
                                    )     
        self.A = tf.keras.layers.Dense(self.num_actions)

    def call(self, obs):
        """Forward pass of the neural network with some inputs."""
        obs = self.conv1(obs)
        obs = self.conv2(obs)
        obs = self.conv3(obs)
        obs = self.flatten(obs)
        
        v = self.dense_V_stream(obs)
        v = self.V(v)
        
        a = self.dense_A_stream(obs)
        a = self.A(a)
        
        Q = v + tf.subtract(a, tf.reduce_mean(a, axis=1, keepdims=True))
        return Q




class DQN:
    def __init__(self, frame_size = (160, 160, 4), num_actions = 6, lr = 0.000025):
        

        self.frame_size = frame_size # inference size
        self.num_actions = num_actions
        self.lr = lr
        self.loss = tf.keras.losses.MeanSquaredError()
        # self.optimizer = tf.keras.optimizers.SGD(learning_rate = self.lr) #, momentum=0.9, nesterov=True)    
        self.optimizer = tf.keras.optimizers.Adam(learning_rate = self.lr, clipnorm=10)

        self.model = None

    def build_model(self):    
        model = Model(self.frame_size, self.num_actions)        
        model.compile(loss = self.loss, optimizer = self.optimizer)
        rows, cols, phi = self.frame_size
        model.build(input_shape = (None, rows, cols, phi))
        self.model =  model

    def evalQ_single(self, s):
        inp = s[None, ...]
        return self.model(inp).numpy()[0]
    
    def evalQ_batch(self, s):
        return self.model(s).numpy()



class Bot():
    def __init__(self, 
                 env = env,
                 num_episodes = 1000,
                 max_steps = 1000, # steps per episode
                 frame = Image_stack(4), 
                 experience_stack = Exp_stack(max_size = 1000, batch_size = 16, inf_pack_size = 4)  , 
                 dqn_main = None,   # used for Predict next Action
                 dqn_target = None, # used for Evaluate Q_target
                 gamma = 0.99,
                 skip = 3
                ):

        self.env = env
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.frame = frame
        self.experience_stack = experience_stack
        
        self.dqn_main = dqn_main
        self.dqn_target = dqn_target
        self.target_update()
        
        self.gamma = gamma
        self.skip = skip

        self.current_epizode = 0
        self.current_step = 0

        self.tmp_q_target = None

    
    def target_update(self):
        weights = self.dqn_main.model.get_weights()
        self.dqn_target.model.set_weights(weights)
    
    def get_eps(self):
        return 1.0 - ( self.current_epizode / self.num_episodes)**0.2
        
    def get_r_s1_term_chain(self, a_):
        s1_, r_, terminated_, truncated_, info_ = self.env.step(a_) # make a STEP
        s1_ = state_preprocess(s1_)
        self.current_step += 1
        return r_, s1_, terminated_
       
    
            
 
    def play_epoch_collect_data(self):
        self.current_step = 0
        self.frame.reset()
        
        s = self.env.reset()[0] # RESET ENVIRONMENT  
        s = state_preprocess(s)
        a = self.env.action_space.sample()

        reward_per_epoch = 0

        while self.current_step < self.max_steps:       
            
            
            reward_per_frame = 0
            while not self.frame.full:
                
                r, s1, terminated = self.get_r_s1_term_chain(a)
                reward_per_frame += r

                ## SAVE IN MEMORY
                if self.current_step % (self.skip + 1) == 0:                
                    self.frame.add([s, a, r , s1, terminated])         # REWARD COEFFICIENT MULTIPLICATION
    
                s = copy.deepcopy(s1) 
                
                if terminated:
                    self.frame.reset() 
                    break

            else:
                prepared_pack = self.frame.prepared_to_save_in_memory()     # Extract pack from collected sars1 chains
                self.experience_stack.add(prepared_pack)                    # Save extracted_pack into MEMORY
                self.frame.reset() 

                reward_per_epoch += reward_per_frame
                reward_per_frame = 0
                
    
                if np.random.rand() < self.get_eps():                       # get random action
                    a = self.env.action_space.sample()
                    # print('random')
                else:
                    Q_s_sample = self.dqn_main.evalQ_single(prepared_pack[0])   # get vector of Q-values per ACTION        
                    a = np.argmax(Q_s_sample)                              # select the action according to the current policy
                    # print('by policy')
  
        return reward_per_epoch
 

    def train_on_collected_data(self, train_cycles = 1):
        for i_cycle in range(train_cycles):
                
            #################################################
            # TRAIN Q FUNCTION on MINIBATCH 
            if self.experience_stack.full:

                s_batch, action_batch, reward_batch, s1_batch, done_batch = self.experience_stack.get_minibatch()
                                

                
                Q_s  = self.dqn_target.evalQ_batch(s_batch)
                # print('Q_s', Q_s)
                # print('action_batch', action_batch)

                # next_q_values = self.dqn_target.evalQ_batch(s1_batch).max(axis=1) # DOUBLE DQN
                next_q_values = self.dqn_target.evalQ_batch(s1_batch)[
                    range(self.experience_stack.batch_size),
                    np.argmax(self.dqn_main.evalQ_batch(s1_batch), axis=1)
                ]  # DOUBLE DUELLING DQN: target net evaluates the action chosen by the main net
                Q_s[range(self.experience_stack.batch_size), action_batch] = reward_batch + (1 - done_batch) * next_q_values * self.gamma
                
                self.tmp_q_target = Q_s # just for case
                # print('Q_s type', Q_s.dtype)
                self.dqn_main.model.train_on_batch(s_batch, Q_s) 
                

  
    def play_game(self, play_steps = 1000):
        
        env_sim = RecordVideo(self.env, './video',  episode_trigger = lambda episode_number: True)
        # env_sim = RecordVideo(self.env, './video', video_length = 1000,  step_trigger=lambda x: x % 10 == 0)
        
        self.current_step = 0
        self.frame.reset()
        
        s = env_sim.reset()[0] # RESET ENVIRONMENT  
        s = state_preprocess(s)
        a = env_sim.action_space.sample()

        reward_per_epoch = 0

        while self.current_step < self.max_steps:       
            
            
            reward_per_frame = 0
            while not self.frame.full:
                
                s1, r, terminated, truncated, info = env_sim.step(a) # make a STEP
                s1 = state_preprocess(s1)
                self.current_step += 1
                reward_per_frame += r

                ## SAVE IN MEMORY
                if self.current_step % (self.skip + 1) == 0:                
                    self.frame.add([s, a, r , s1, terminated])         # REWARD COEFFICIENT MULTIPLICATION
    
                s = copy.deepcopy(s1) 
                
                if terminated:
                    self.frame.reset() 
                    break

            else:
                prepared_pack = self.frame.prepared_to_save_in_memory()     # Extract pack from collected sars1 chains
                self.experience_stack.add(prepared_pack)                    # Save extracted_pack into MEMORY
                self.frame.reset() 

                reward_per_epoch += reward_per_frame
                reward_per_frame = 0
                
    
    
                Q_s_sample = self.dqn_main.evalQ_single(prepared_pack[0])   # get vector of Q-values per ACTION        
                a = np.argmax(Q_s_sample)                              # select the action according to the current policy
                # print('by policy')

            
            

        env_sim.close()     
        show_video()
        return reward_per_epoch

    def play_and_train_epoch(self, train_cycles = 1, train_every = 4):
        pack_counter = 0
        self.current_step = 0
        self.frame.reset()
        
        s = self.env.reset()[0] # RESET ENVIRONMENT  
        s = state_preprocess(s)
        a = self.env.action_space.sample()

        reward_per_epoch = 0

        while self.current_step < self.max_steps:       
            
            
            reward_per_frame = 0
            while not self.frame.full:
                
                r, s1, terminated = self.get_r_s1_term_chain(a)
                reward_per_frame += r

                ## SAVE IN MEMORY
                if self.current_step % (self.skip + 1) == 0:                
                    self.frame.add([s, a, r , s1, terminated])         # REWARD COEFFICIENT MULTIPLICATION
    
                s = copy.deepcopy(s1) 
                
                if terminated:
                    self.frame.reset() 
                    break

            else:
                prepared_pack = self.frame.prepared_to_save_in_memory()     # Extract pack from collected sars1 chains
                pack_counter += 1
                self.experience_stack.add(prepared_pack)                    # Save extracted_pack into MEMORY
                self.frame.reset() 

                reward_per_epoch += reward_per_frame
                reward_per_frame = 0
                
    
                if np.random.rand() < self.get_eps():                       # get random action
                    a = self.env.action_space.sample()
                    # print('random')
                else:
                    Q_s_sample = self.dqn_main.evalQ_single(prepared_pack[0])   # get vector of Q-values per ACTION        
                    a = np.argmax(Q_s_sample)                              # select the action according to the current policy
                    # print('by policy')

            ###################################
            # train_cycles = len(self.experience_stack.stack) // self.experience_stack.batch_size
            if self.experience_stack.full and (pack_counter % train_every == 0):
                # print('trained')
                self.train_on_collected_data(train_cycles = train_cycles) # TRAIN

            ###################################
                
        return reward_per_epoch

    
    
    # COLLECT and TRAIN SEPARATELY
    def train(self, train_cycles = 1, REWARD_AVERAGE_WINDOW = 10, weights_update_per_epizode = 1): 
        reward_history = list()            # save results when training
        reward_history_average = list()    # total rewards per episode (averaged over a window)
        just_play_reward_list = list()     # save results when testing - without random actions - on video
        
        for self.current_epizode in range(self.num_episodes):

            # COLLECT
            # print('Collecting...')
            reward_per_epoch = self.play_epoch_collect_data()
            reward_history.append(reward_per_epoch)
            
            

            # TRAIN
            # print('Training...')
            self.train_on_collected_data(train_cycles = train_cycles)

            # Target NN weights update
            if self.current_epizode % weights_update_per_epizode == 0:
                self.target_update()

            # TEST ON VIDEO
            # print('Testing...')
            if (self.current_epizode > REWARD_AVERAGE_WINDOW) and (self.current_epizode % REWARD_AVERAGE_WINDOW == 0):
                display.clear_output(wait=True)
                reward_history_average.append(np.mean(reward_history[-REWARD_AVERAGE_WINDOW :])) # calc mean history
                                              
                # just_play_reward = self.play_game(play_steps = self.max_steps) # play game without random

                #save best video
                # if len(just_play_reward_list) == 0 or just_play_reward > max(just_play_reward_list):
                #     save_best_video()
                    
                # just_play_reward_list.append(just_play_reward) # save reward
                
                
                print('epizode', self.current_epizode)
                # print('just_play_reward: ', just_play_reward)
                if self.tmp_q_target is not None:
                    print('Q_target', self.tmp_q_target[0])
                plt.plot(reward_history_average, c = 'blue', label="train score")
                # plt.plot(just_play_reward_list, c = 'red', label="test score")
                plt.legend(loc="upper left")
                # plt.scatter(x = len(reward_history_average)-1, y = just_play_reward, marker = 'x', c = 'red')
                plt.show()

        print('finished')          

    # COLLECT and TRAIN in one time
    def train_simultaneously(self, train_cycles = 1, REWARD_AVERAGE_WINDOW = 1, train_every = 4, weights_update_per_epizode = 1):
            reward_history = list()            # save results when training
            reward_history_average = list()    # total rewards per episode (averaged over a window)
            just_play_reward_list = list()     # save results when testing - without random actions - on video
            
            for self.current_epizode in range(self.num_episodes):
    
                # COLLECT and TRAIN epoch
                # print('Collecting...')
                reward_per_epoch = self.play_and_train_epoch(train_cycles = train_cycles, train_every = train_every)
                reward_history.append(reward_per_epoch)

                # Target NN weights update
                if self.current_epizode % weights_update_per_epizode == 0:
                    self.target_update()

               
    
                # TEST ON VIDEO
                # print('Testing...')
                if (self.current_epizode > REWARD_AVERAGE_WINDOW) and (self.current_epizode % REWARD_AVERAGE_WINDOW == 0):
                    display.clear_output(wait=True)
                    reward_history_average.append(np.mean(reward_history[-REWARD_AVERAGE_WINDOW :])) # calc mean history
                                                  
                    # just_play_reward = self.play_game(play_steps = self.max_steps) # play game without random
    
                    #save best video
                    # if len(just_play_reward_list) == 0 or just_play_reward > max(just_play_reward_list):
                    #     save_best_video()
                        
                    # just_play_reward_list.append(just_play_reward) # save reward
                    
                    
                    print('epizode', self.current_epizode)
                    # print('just_play_reward: ', just_play_reward)
                    if self.tmp_q_target is not None:
                        print('Q_target', self.tmp_q_target[0])
                    plt.plot(reward_history_average, c = 'blue', label="train score")
                    # plt.plot(just_play_reward_list, c = 'red', label="test score")
                    plt.legend(loc="upper left")
                    # plt.scatter(x = len(reward_history_average)-1, y = just_play_reward, marker = 'x', c = 'red')
                    plt.show()
    
            print('finished')  


NUM_EPISODES = 100000        # NUMBER OF EPISODES TO TRAIN
MAX_STEPS = 30000           # MAXIMAL AMOUNT OF STEPS PER EPISODE
REWARD_AVERAGE_WINDOW = NUM_EPISODES // 10 # window for averaging episode rewards

FRAME_STACK_SIZE = 4        # num images in stack for inference
FRAME_SIZE = (80, 80, FRAME_STACK_SIZE)

EXP_STACK_SIZE = 10000     # MAXIMAL NUMBER OF ELEMENTS IN MEMORY STACK
EXP_MINIBATCH_SIZE = 1024 #16

GAMMA = 0.99                # DISCOUNTING PARAMETER
LEARNING_RATE = 0.00025     # 0.00025 

SKIP_FRAMES = 0 # 3 = amount of states to skip - saves every 4th
###########################################

###########################################
# CREATE GLOBAL OBJECTS

# MEMORY STACK. ACCRUES ALL STEP DATA. ZIPS IMAGES FROM SEVERAL STEPS INTO A TENSOR 
experience_stack = Exp_stack(max_size = EXP_STACK_SIZE,  
                             batch_size = EXP_MINIBATCH_SIZE,
                             inf_pack_size = FRAME_STACK_SIZE) # connection of S, A, R, S1, DONE chains

# FRAME. Image stack for inference # USED FOR EVERY STEP TO GET AN ACTION
frame = Image_stack(FRAME_STACK_SIZE) 



# CREATE GAME ENVIRONMENT
env = gym.make('PongNoFrameskip-v4', render_mode="rgb_array", obs_type="grayscale") 
env = gym.wrappers.ResizeObservation(env, (105, 80))


# CREATE and BUILD DQN model
dqn_main = DQN(frame_size = FRAME_SIZE, num_actions = NUM_ACTIONS, lr = LEARNING_RATE) # Deep Q Net object
dqn_main.build_model()

dqn_target = DQN(frame_size = FRAME_SIZE, num_actions = NUM_ACTIONS, lr = LEARNING_RATE) # Deep Q Net object
dqn_target.build_model()


# CREATE BOT 
bot = Bot(env = env,  
             num_episodes = NUM_EPISODES,
             max_steps = MAX_STEPS, # steps per episode
             frame = frame, experience_stack = experience_stack,
             dqn_main = dqn_main,
             dqn_target = dqn_target,
             gamma = GAMMA,
             skip = SKIP_FRAMES
            )

# START TRAINING
bot.train(train_cycles = 1, REWARD_AVERAGE_WINDOW = 100, weights_update_per_epizode = 1000)


(Screenshot of the resulting reward curve attached.)
