I'm trying to implement a DQN model for the Pong game. However, even after about 1000 episodes the agent still behaves essentially randomly; training the CNN does not seem to improve it.
Here is my main code:
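For completeness, the code below assumes the following imports and aliases:

import random
from collections import deque

import gym
import numpy as np
import matplotlib.pyplot as plt
import torch as th
import torch.nn as nn
import torch.nn.functional as F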
I create a CNN with three convolution layers, each followed by max pooling, and three fully connected layers. The number of input channels is the number of stacked pre-processed frames (each observation goes from 3*210*160 to 4*84*84, so there are 4 channels):
class CNN(nn.Module):
    def __init__(self, s_channels, a_space):
        super(CNN, self).__init__()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
        self.conv1 = nn.Conv2d(s_channels, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, 2)
        self.conv3 = nn.Conv2d(64, 64, 3, 1)
        self.fc1 = nn.Linear(64*4*4, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, a_space)

    def forward(self, input):
        output = self.pool(F.relu(self.conv1(input)))
        output = self.pool(F.relu(self.conv2(output)))
        output = self.pool(F.relu(self.conv3(output)))
        output = output.view(-1, 64*4*4)
        output = F.relu(self.fc1(output))
        output = F.relu(self.fc2(output))
        output = F.relu(self.fc3(output))
        return output
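For reference, the spatial size works out to 84 → 20 → 19 → 8 → 7 → 5 → 4 through the conv/pool stack, which is why fc1 expects 64*4*4 input features. A quick check with a dummy batch (4 stacked frames, 6 actions for Pong) confirms the output shape:

# sanity check: pass one dummy stacked observation through the network
net = CNN(s_channels=4, a_space=6)
dummy = th.zeros(1, 4, 84, 84)
print(net(dummy).shape)  # torch.Size([1, 6])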
After that, I construct an agent class with an action-selection function and a CNN training function. In the training function, I compute the loss from the whole batch at once instead of iterating over the batch with a step-by-step for loop. Before computing the loss and calling backward, I transform the stored image data into batch-sized tensors. Here is the agent class:
class Agent():
    def __init__(self, s_space, a_space, device) -> None:
        # set GPU device to cuda
        self.device = device
        # define parameters
        self.epsilon = 1.0
        self.min_epsilon = 0.01
        self.dr = 0.995
        self.lr = 0.001
        self.gamma = 0.9
        # define models
        self.evl_net = CNN(s_space, a_space).to(self.device)
        self.tgt_net = CNN(s_space, a_space).to(self.device)
        self.cert = nn.SmoothL1Loss()
        self.optimal = th.optim.Adam(self.evl_net.parameters(), lr=self.lr)
        # define memory store
        self.memory = deque(maxlen=2000)

    # pre-process the input image data
    def data_pre_process(self, batch_size):
        s_v = []
        a_v = []
        next_s_v = []
        r_v = []
        dones = []
        materials = random.sample(self.memory, batch_size)
        for t in materials:
            s_v.append(t[0])
            a_v.append(t[1])
            next_s_v.append(t[2])
            r_v.append(t[3])
            dones.append(t[4])
        s_v = th.Tensor(s_v).to(self.device)
        a_v = th.LongTensor(a_v).unsqueeze(1).to(self.device)
        r_v = th.FloatTensor(r_v).to(device)
        # print(r_v.shape)
        return s_v, a_v, next_s_v, r_v, dones

    # record the transformed images
    def record(self, tpl):
        self.memory.append(tpl)

    # select actions according to the states (input images with 4 channels)
    def select(self, state, a_space):
        actions = self.evl_net(state).data.tolist()
        if(random.random() <= self.epsilon):
            action = random.randint(0, a_space-1)
        else:
            action = actions.index(max(actions))
        return action

    # save CNN model
    def save(self):
        th.save(self.evl_net.state_dict(), "./Pong.pth")

    # at the beginning load the saved CNN model
    def load(self, s_channels, a_space):
        self.evl_net = CNN(s_channels, a_space).to(self.device)
        self.evl_net.load_state_dict(th.load("./Pong.pth"))

    # DQN replay progression
    def train(self, state, batch_size):
        """
        s_v_size:      [batch_size,4,84,84]  type: Tensor
        s_a_size:      [batch_size,1]        type: Tensor
        next_s_v_size: [batch_size,4,84,84]  type: List
        r_v_size:      [1,batch_size]        type: Tensor
        dones_size:    [batch_size]          type: List
        """
        s_v, a_v, next_s_v, r_v, dones = self.data_pre_process(batch_size)
        self.tgt_net.load_state_dict(self.evl_net.state_dict())
        # create evl_Q_value tensor
        evl_Q_value = self.evl_net(s_v).gather(0, a_v)  # size: [batch_size,6].gather() -> [batch_size,1] Type: Tensor
        # correctly transform next_s_v into tensor:
        nonDone_index = th.LongTensor(tuple([i for i, x in enumerate(dones) if x != True])).to(self.device)
        tgt_Q_value = th.zeros(batch_size).to(device)
        true_next_s_v = list(filter((None).__ne__, next_s_v))  # pop the "None" elements
        true_next_s_v = th.FloatTensor(true_next_s_v).to(self.device)  # size: [notDone_batch_size,4,84,84]
        # print(true_next_s_v.shape)
        tgt = self.tgt_net(true_next_s_v).max(1)[0].detach()  # size [1,notDone_batch_size] Type: Tensor
        # print(tgt.shape)
        # update tgt_Q_value
        tgt_Q_value[nonDone_index] = tgt
        tgt_Q_value = r_v + self.gamma * tgt_Q_value
        tgt_Q_value = tgt_Q_value.reshape(batch_size, 1)  # size: [batch_size, 1] cannot be back propagated
        # print(tgt_Q_value)
        self.optimal.zero_grad()
        loss = self.cert(evl_Q_value, tgt_Q_value)
        loss.backward()
        # constrain the gradient from explosion
        for p in self.evl_net.parameters():
            p.grad.data.clamp_(-1, 1)
        self.optimal.step()
        # decay epsilon (exploration rate)
        if(self.epsilon > self.min_epsilon):
            self.epsilon *= self.dr
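To make my intent explicit: train() is meant to implement the standard DQN update, where the target for a sampled transition (s, a, r, next_s, done) is y = r if done, and y = r + gamma * max_a' Q_tgt(next_s, a') otherwise, with the SmoothL1 loss taken between Q_evl(s, a) and y.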
In the main training loop, I let the batch size grow from 32 to 64 to speed up the operation. The CNN is updated every four episodes, and the statistics are printed every ten episodes.
# set GPU device to cuda
device = th.device("cuda:0" if th.cuda.is_available() else "cpu")
# set episode count and batch_size
episodes = 5000
batch_size = 32
env = gym.make("PongNoFrameskip-v4")
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=True, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
# create frame stack for the input image data (size: (4,84,84))
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
a_space = env.action_space.n
agent = Agent(channels, a_space, device)
agent.load(channels, a_space)
# training start:
for e in range(episodes):
    # step 1: reset the environment at the beginning of each episode
    s = np.array(env.reset())
    img = plt.imshow(env.render('rgb_array'))
    done = False
    score = 0
    while not done:
        # step 2: select and take an action
        a = agent.select(th.Tensor(s).unsqueeze(0).to(device), a_space)
        next_s, reward, done, _ = env.step(a)
        if(done == True):
            reward = -1.0
            next_s = None
        else:
            next_s = np.array(next_s)
            # print(next_s.shape)
        # step 3: record the data into the buffer
        dataset = (s, a, next_s, reward, done)
        agent.record(dataset)
        # step 4: update the state
        s = next_s
        score += reward
    # step 5: train and update the CNN every 4 episodes
    if(len(agent.memory) > batch_size and e % 4 == 0):
        agent.train(channels, batch_size)
        agent.save()
    # appendix 1: at the beginning increase batch_size from 32 to 64
    if(batch_size < 64):
        batch_size += 1
    # appendix 2: report the score every 10 episodes
    if(e % 10 == 0 and len(agent.memory) > batch_size):
        print("episodes:", e, "score:", score, "epsilon: {:.2}".format(agent.epsilon))
The script runs without raising any errors. However, the agent does not perform as well as expected: after 1000 episodes it still returns negative scores, just as it did at the very start. The output looks like this:
episodes: 800 score: -20.0 epsilon: 0.37
episodes: 810 score: -21.0 epsilon: 0.36
episodes: 820 score: -21.0 epsilon: 0.36
episodes: 830 score: -21.0 epsilon: 0.35
episodes: 840 score: -21.0 epsilon: 0.35
episodes: 850 score: -21.0 epsilon: 0.34
episodes: 860 score: -21.0 epsilon: 0.34
episodes: 870 score: -21.0 epsilon: 0.34
episodes: 880 score: -20.0 epsilon: 0.33
episodes: 890 score: -21.0 epsilon: 0.33
episodes: 900 score: -20.0 epsilon: 0.32
episodes: 910 score: -21.0 epsilon: 0.32
episodes: 920 score: -21.0 epsilon: 0.31
episodes: 930 score: -21.0 epsilon: 0.31
episodes: 940 score: -21.0 epsilon: 0.31
episodes: 950 score: -21.0 epsilon: 0.3
episodes: 960 score: -21.0 epsilon: 0.3
episodes: 970 score: -21.0 epsilon: 0.3
episodes: 980 score: -21.0 epsilon: 0.29
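One thing that does look right is the exploration schedule: epsilon is multiplied by 0.995 once per train() call, and train() runs once every 4 episodes, so after 800 episodes it should be roughly 0.995^200 ≈ 0.37 and after 980 episodes roughly 0.995^245 ≈ 0.29, which is exactly what is printed. So epsilon decays as intended, but the scores stay around -20 / -21.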
I have rechecked the structure of the model against the DQN algorithm as described in theory but found nothing different. I would appreciate any advice on how to deal with this problem.