I'm trying to implement a DQN model for the Pong game. However, even after about 1000 episodes the agent still behaves essentially randomly; training the CNN does not seem to improve it.
Here is my main code:
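For completeness, the code below assumes the following imports and aliases:

import random
from collections import deque

import gym
import numpy as np
import matplotlib.pyplot as plt
import torch as th
import torch.nn as nn
import torch.nn.functional as F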
I create a CNN with three convolution layers, each followed by max pooling, and three fully connected layers. The number of input channels is the number of stacked pre-processed frames (each observation goes from 3*210*160 to 4*84*84, so there are 4 channels):
class CNN(nn.Module):
    def __init__(self, s_channels, a_space):
        super(CNN, self).__init__()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
        self.conv1 = nn.Conv2d(s_channels, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, 2)
        self.conv3 = nn.Conv2d(64, 64, 3, 1)
        self.fc1 = nn.Linear(64*4*4, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, a_space)

    def forward(self, input):
        output = self.pool(F.relu(self.conv1(input)))
        output = self.pool(F.relu(self.conv2(output)))
        output = self.pool(F.relu(self.conv3(output)))
        output = output.view(-1, 64*4*4)
        output = F.relu(self.fc1(output))
        output = F.relu(self.fc2(output))
        output = F.relu(self.fc3(output))
        return output
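For reference, the spatial size works out to 84 → 20 → 19 → 8 → 7 → 5 → 4 through the conv/pool stack, which is why fc1 expects 64*4*4 input features. A quick check with a dummy batch (4 stacked frames, 6 actions for Pong) confirms the output shape:

# sanity check: pass one dummy stacked observation through the network
net = CNN(s_channels=4, a_space=6)
dummy = th.zeros(1, 4, 84, 84)
print(net(dummy).shape)  # torch.Size([1, 6])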
After that, I construct an agent class with an action-selection function and a CNN training function. In the training function, I compute the loss from the whole batch at once instead of iterating over the batch with a step-by-step for loop. Before computing the loss and calling backward, I transform the stored image data into batch-sized tensors. Here is the agent class:
class Agent():
    def __init__(self, s_space, a_space, device) -> None:
        # set GPU device to cuda
        self.device = device
        # define parameters
        self.epsilon = 1.0
        self.min_epsilon = 0.01
        self.dr = 0.995
        self.lr = 0.001
        self.gamma = 0.9
        # define models
        self.evl_net = CNN(s_space, a_space).to(self.device)
        self.tgt_net = CNN(s_space, a_space).to(self.device)
        self.cert = nn.SmoothL1Loss()
        self.optimal = th.optim.Adam(self.evl_net.parameters(), lr=self.lr)
        # define memory store
        self.memory = deque(maxlen=2000)

    # pre-process the input image data
    def data_pre_process(self, batch_size):
        s_v = []
        a_v = []
        next_s_v = []
        r_v = []
        dones = []
        materials = random.sample(self.memory, batch_size)
        for t in materials:
            s_v.append(t[0])
            a_v.append(t[1])
            next_s_v.append(t[2])
            r_v.append(t[3])
            dones.append(t[4])
        s_v = th.Tensor(s_v).to(self.device)
        a_v = th.LongTensor(a_v).unsqueeze(1).to(self.device)
        r_v = th.FloatTensor(r_v).to(device)
        # print(r_v.shape)
        return s_v, a_v, next_s_v, r_v, dones

    # record the transformed images
    def record(self, tpl):
        self.memory.append(tpl)

    # select actions according to the states (input images with 4 channels)
    def select(self, state, a_space):
        actions = self.evl_net(state).data.tolist()
        if(random.random() <= self.epsilon):
            action = random.randint(0, a_space-1)
        else:
            action = actions.index(max(actions))
        return action

    # save CNN model
    def save(self):
        th.save(self.evl_net.state_dict(), "./Pong.pth")

    # at the beginning load the saved CNN model
    def load(self, s_channels, a_space):
        self.evl_net = CNN(s_channels, a_space).to(self.device)
        self.evl_net.load_state_dict(th.load("./Pong.pth"))

    # DQN replay progression
    def train(self, state, batch_size):
        """
        s_v_size:      [batch_size,4,84,84]  type: Tensor
        s_a_size:      [batch_size,1]        type: Tensor
        next_s_v_size: [batch_size,4,84,84]  type: List
        r_v_size:      [1,batch_size]        type: Tensor
        dones_size:    [batch_size]          type: List
        """
        s_v, a_v, next_s_v, r_v, dones = self.data_pre_process(batch_size)
        self.tgt_net.load_state_dict(self.evl_net.state_dict())
        # create evl_Q_value tensor
        evl_Q_value = self.evl_net(s_v).gather(0, a_v)  # size: [batch_size,6].gather() -> [batch_size,1] Type: Tensor
        # correctly transform next_s_v into tensor:
        nonDone_index = th.LongTensor(tuple([i for i, x in enumerate(dones) if x != True])).to(self.device)
        tgt_Q_value = th.zeros(batch_size).to(device)
        true_next_s_v = list(filter((None).__ne__, next_s_v))  # pop the "None" elements
        true_next_s_v = th.FloatTensor(true_next_s_v).to(self.device)  # size: [notDone_batch_size,4,84,84]
        # print(true_next_s_v.shape)
        tgt = self.tgt_net(true_next_s_v).max(1)[0].detach()  # size [1,notDone_batch_size] Type: Tensor
        # print(tgt.shape)
        # update tgt_Q_value
        tgt_Q_value[nonDone_index] = tgt
        tgt_Q_value = r_v + self.gamma * tgt_Q_value
        tgt_Q_value = tgt_Q_value.reshape(batch_size, 1)  # size: [batch_size, 1] cannot be back propagated
        # print(tgt_Q_value)
        self.optimal.zero_grad()
        loss = self.cert(evl_Q_value, tgt_Q_value)
        loss.backward()
        # constrain the gradient from explosion
        for p in self.evl_net.parameters():
            p.grad.data.clamp_(-1, 1)
        self.optimal.step()
        # decay epsilon (exploration rate)
        if(self.epsilon > self.min_epsilon):
            self.epsilon *= self.dr
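To make my intent explicit: train() is meant to implement the standard DQN update, where the target for a sampled transition (s, a, r, next_s, done) is y = r if done, and y = r + gamma * max_a' Q_tgt(next_s, a') otherwise, with the SmoothL1 loss taken between Q_evl(s, a) and y.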
In the main training loop, I let the batch size grow from 32 to 64 to speed up the operation. The CNN is updated every four episodes, and the statistics are printed every ten episodes.
# set GPU device to cuda
device = th.device("cuda:0" if th.cuda.is_available() else "cpu")
# set episode count and batch_size
episodes = 5000
batch_size = 32
env = gym.make("PongNoFrameskip-v4")
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=True, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
# create frame stack for the input image data (size: (4,84,84))
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
a_space = env.action_space.n
agent = Agent(channels, a_space, device)
agent.load(channels, a_space)
# training start:
for e in range(episodes):
    # step 1: reset the environment at the beginning of each episode
    s = np.array(env.reset())
    img = plt.imshow(env.render('rgb_array'))
    done = False
    score = 0
    while not done:
        # step 2: select and take an action
        a = agent.select(th.Tensor(s).unsqueeze(0).to(device), a_space)
        next_s, reward, done, _ = env.step(a)
        if(done == True):
            reward = -1.0
            next_s = None
        else:
            next_s = np.array(next_s)
            # print(next_s.shape)
        # step 3: record the data into the buffer
        dataset = (s, a, next_s, reward, done)
        agent.record(dataset)
        # step 4: update the state
        s = next_s
        score += reward
    # step 5: train and update the CNN every 4 episodes
    if(len(agent.memory) > batch_size and e % 4 == 0):
        agent.train(channels, batch_size)
        agent.save()
    # appendix 1: at the beginning increase batch_size from 32 to 64
    if(batch_size < 64):
        batch_size += 1
    # appendix 2: report the score every 10 episodes
    if(e % 10 == 0 and len(agent.memory) > batch_size):
        print("episodes:", e, "score:", score, "epsilon: {:.2}".format(agent.epsilon))
The script runs without raising any errors. However, the agent does not perform as well as expected: after 1000 episodes it still returns negative scores, just as it did at the very start. The output looks like this:
episodes: 800 score: -20.0 epsilon: 0.37
episodes: 810 score: -21.0 epsilon: 0.36
episodes: 820 score: -21.0 epsilon: 0.36
episodes: 830 score: -21.0 epsilon: 0.35
episodes: 840 score: -21.0 epsilon: 0.35
episodes: 850 score: -21.0 epsilon: 0.34
episodes: 860 score: -21.0 epsilon: 0.34
episodes: 870 score: -21.0 epsilon: 0.34
episodes: 880 score: -20.0 epsilon: 0.33
episodes: 890 score: -21.0 epsilon: 0.33
episodes: 900 score: -20.0 epsilon: 0.32
episodes: 910 score: -21.0 epsilon: 0.32
episodes: 920 score: -21.0 epsilon: 0.31
episodes: 930 score: -21.0 epsilon: 0.31
episodes: 940 score: -21.0 epsilon: 0.31
episodes: 950 score: -21.0 epsilon: 0.3
episodes: 960 score: -21.0 epsilon: 0.3
episodes: 970 score: -21.0 epsilon: 0.3
episodes: 980 score: -21.0 epsilon: 0.29
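One thing that does look right is the exploration schedule: epsilon is multiplied by 0.995 once per train() call, and train() runs once every 4 episodes, so after 800 episodes it should be roughly 0.995^200 ≈ 0.37 and after 980 episodes roughly 0.995^245 ≈ 0.29, which is exactly what is printed. So epsilon decays as intended, but the scores stay around -20 / -21.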
I have rechecked the structure of the model against the DQN algorithm as described in theory but found nothing different. I would appreciate any advice on how to deal with this problem.