I am currently working on an RL agent that has to learn how to play Pacman, and I implemented a PPO algorithm. It worked fine on a small map with only a single pile to eat and one ghost (the pile was spawned randomly at each episode).
I then decided to move on to a "classic" map, with piles at every empty space, 2 ghosts, and 2 powerups.
I have tried many configurations so far, but the policy loss keeps decreasing too fast. In less than 50 episodes the loss is close to 0, yet the reward is still negative and Pacman's behavior is really dumb.
Here is the code of my PPO algorithm:

```python
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import gym
import random

DEVICE = "cpu"

# Policy and value model
class ActorCriticNetwork(nn.Module):
    def __init__(self, obs_space_size, action_space_size):
        super().__init__()
        self.shared_layers = nn.Sequential(
            nn.Linear(obs_space_size, 256),
            nn.Tanh(),
            nn.Linear(256, 256),
            nn.Tanh(),
            nn.Linear(256, 256),
            nn.Tanh(),
        )
        self.policy_layers = nn.Sequential(
            nn.Linear(256, 256),
            nn.Tanh(),
            nn.Linear(256, action_space_size),
            nn.Softmax(dim=-1)
        )
        self.value_layers = nn.Sequential(
            nn.Linear(256, 256), nn.Tanh(), nn.Linear(256, 1)
        )

    def value(self, obs):
        z = self.shared_layers(obs)
        value = self.value_layers(z)
        return value

    def policy(self, obs):
        z = self.shared_layers(obs)
        policy_logits = self.policy_layers(z)
        return policy_logits

    def forward(self, obs):
        z = self.shared_layers(obs)
        policy_logits = self.policy_layers(z)
        value = self.value_layers(z)
        return policy_logits, value

# def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
#     if isinstance(layer, nn.Linear):
#         torch.nn.init.orthogonal_(layer.weight, std)
#         torch.nn.init.constant_(layer.bias, bias_const)
#     return layer

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    # Xavier/Glorot initialization
    if isinstance(layer, nn.Linear):
        fan_in, fan_out = nn.init._calculate_fan_in_and_fan_out(layer.weight)
        std = np.sqrt(2.0 / (fan_in + fan_out))
        nn.init.normal_(layer.weight, mean=0.0, std=std)
        nn.init.constant_(layer.bias, bias_const)
    return layer

class PPOTrainer:
    def __init__(
        self,
        actor_critic,
        ppo_clip_value=0.2,
        target_kl_div=0.01,
        max_policy_train_iters=80,
        value_train_iters=80,
        policy_lr=3e-4,
        value_lr=1e-2,
    ):
        self.ac = actor_critic
        self.ppo_clip_val = ppo_clip_value
        self.target_kl_div = target_kl_div
        self.max_policy_train_iters = max_policy_train_iters
        self.value_train_iters = value_train_iters

        policy_params = list(self.ac.shared_layers.parameters()) + list(
            self.ac.policy_layers.parameters()
        )
        self.policy_optim = optim.Adam(policy_params, lr=policy_lr)

        value_params = list(self.ac.shared_layers.parameters()) + list(
            self.ac.value_layers.parameters()
        )
        self.value_optim = optim.Adam(value_params, lr=value_lr)

    def train_policy(self, obs, acts, old_log_probs, gaes):
        losses = []
        for _ in range(self.max_policy_train_iters):
            self.policy_optim.zero_grad()

            new_logits = self.ac.policy(obs)
            new_logits = Categorical(logits=new_logits)
            new_log_probs = new_logits.log_prob(acts)

            policy_ratio = torch.exp(new_log_probs - old_log_probs)
            clipped_ratio = policy_ratio.clamp(
                1 - self.ppo_clip_val, 1 + self.ppo_clip_val
            )
            clipped_loss = clipped_ratio * gaes
            full_loss = policy_ratio * gaes
            # policy_loss = -torch.min(full_loss, clipped_loss).mean()
            # In the policy training loop: clipped surrogate loss plus an entropy bonus
            policy_loss = -torch.min(full_loss, clipped_loss).mean() - 0.01 * new_logits.entropy().mean()

            policy_loss.backward()
            losses.append(policy_loss.item())
            self.policy_optim.step()

            kl_div = (old_log_probs - new_log_probs).mean()
            if kl_div >= self.target_kl_div:
                break
        return np.asarray(losses).mean()

    def train_value(self, obs, returns):
        loss = []
        for _ in range(self.value_train_iters):
            self.value_optim.zero_grad()

            values = self.ac.value(obs)
            value_loss = (returns - values) ** 2
            value_loss = value_loss.mean()

            value_loss.backward()
            loss.append(value_loss.item())
            self.value_optim.step()
        return np.asarray(loss).mean()

def discount_rewards(rewards, gamma=0.99):
    """
    Return discounted rewards based on the given rewards and gamma param
    """
    new_rewards = [float(rewards[-1])]
    for i in reversed(range(len(rewards) - 1)):
        new_rewards.append(float(rewards[i]) + gamma * new_rewards[-1])
    return np.array(new_rewards[::-1])


def calculate_gaes(rewards, values, gamma=0.99, decay=0.95):
    next_values = np.concatenate([values[1:], [0]])
    deltas = [
        rew + gamma * next_val - val
        for rew, val, next_val in zip(rewards, values, next_values)
    ]

    gaes = []
    gae = 0
    for delta in reversed(deltas):
        gae = delta + gamma * decay * gae
        gaes.append(gae)
    gaes.reverse()
    return np.array(gaes)

def rollout(model, env, epsilon=0.2):
    """
    Performs a single rollout.
    Returns training data in the shape (n_steps, observation_shape) and the cumulative reward.
    """
    ### Create data storage
    train_data = [[], [], [], [], []]  # obs, act, reward, values, act_log_probs
    obs, _ = env.reset()
    ep_reward = 0

    while True:
        logits, val = model(torch.tensor(obs, dtype=torch.float32, device=DEVICE))
        act_distribution = Categorical(logits=logits)

        if random.random() < epsilon:
            # Explore: pick a random action
            act = torch.tensor([env.action_space.sample()], device=DEVICE, dtype=torch.int32)
        else:
            # Exploit: sample an action from the policy distribution
            act = act_distribution.sample()

        act_log_prob = act_distribution.log_prob(act).item()
        act, val = act.item(), val.item()

        next_obs, reward, done, _, _ = env.step(act)

        for i, item in enumerate((obs, act, reward, val, act_log_prob)):
            train_data[i].append(item)

        obs = next_obs
        ep_reward += reward
        if done:
            break

    train_data = [np.asarray(x) for x in train_data]
    # Replace the raw values with GAE advantages
    train_data[3] = calculate_gaes(train_data[2], train_data[3])
    return train_data, ep_reward
```
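
For context, the pieces above are wired together roughly like this (a simplified sketch, not my exact script: the environment id "MyPacmanEnv-v0", the episode count, and the logging line are placeholders, and I assume a flat Box observation space):

```python
# Rough sketch of the training loop; the env id and hyperparameters below are placeholders
env = gym.make("MyPacmanEnv-v0")  # hypothetical id for my custom Pacman environment
model = ActorCriticNetwork(env.observation_space.shape[0], env.action_space.n).to(DEVICE)
ppo = PPOTrainer(model)

n_episodes = 500  # placeholder
for episode in range(n_episodes):
    train_data, ep_reward = rollout(model, env)
    obs, acts, rewards, gaes, act_log_probs = train_data

    # Shuffle the collected transitions
    permute_idxs = np.random.permutation(len(rewards))

    obs = torch.tensor(obs[permute_idxs], dtype=torch.float32, device=DEVICE)
    acts = torch.tensor(acts[permute_idxs], dtype=torch.int32, device=DEVICE)
    gaes = torch.tensor(gaes[permute_idxs], dtype=torch.float32, device=DEVICE)
    act_log_probs = torch.tensor(act_log_probs[permute_idxs], dtype=torch.float32, device=DEVICE)

    # Monte Carlo returns as value targets; unsqueeze so the shape matches the (N, 1) value head output
    returns = discount_rewards(rewards)[permute_idxs]
    returns = torch.tensor(returns, dtype=torch.float32, device=DEVICE).unsqueeze(-1)

    policy_loss = ppo.train_policy(obs, acts, act_log_probs, gaes)
    value_loss = ppo.train_value(obs, returns)

    print(f"Episode {episode} | reward {ep_reward:.1f} | policy loss {policy_loss:.4f} | value loss {value_loss:.4f}")
```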
I tried many learning rates (from 1e-2 to 5e-5; 3e-5 is, I think, the best one). My reward function works like this (a simplified sketch follows the list):
- Eating a pile: +1
- Eating a powerup: +5
- Eating a ghost: +10
- Losing (a ghost ate Pacman): -20
- Winning (ate all the piles): +50
- Making 1000 moves without completing the game: -10
- Doing none of the above: -0.1
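
In code, this corresponds roughly to the following (the flag names and max_moves are placeholders, not my exact implementation):

```python
# Simplified sketch of the reward shaping; the flags and max_moves are placeholder names
def compute_reward(ate_pile, ate_powerup, ate_ghost, lost, won, n_moves, max_moves=1000):
    if lost:                      # a ghost ate Pacman
        return -20
    if won:                       # all piles eaten
        return 50
    if n_moves >= max_moves:      # 1000 moves without completing the game
        return -10
    if ate_ghost:
        return 10
    if ate_powerup:
        return 5
    if ate_pile:
        return 1
    return -0.1                   # none of the above
```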
The observation is a 1D array representing the map (a small example follows the list), where:
- 0 is an empty space
- 1 is a wall
- 2 is the pacman
- 3 is a ghost
- 4 is a powerup
- 5 is a pile
- 6 is a ghost that can be eaten
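
For illustration, on a tiny hypothetical grid (not my actual map, which is much larger) the encoding looks like this:

```python
# Hypothetical 5x5 grid, flattened into the 1D observation
# 0 = empty, 1 = wall, 2 = pacman, 3 = ghost, 4 = powerup, 5 = pile, 6 = edible ghost
grid = np.array([
    [1, 1, 1, 1, 1],
    [1, 2, 5, 5, 1],
    [1, 5, 1, 4, 1],
    [1, 5, 3, 5, 1],
    [1, 1, 1, 1, 1],
])
obs = grid.flatten().astype(np.float32)  # shape (25,), fed directly to the network
```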
If you have any idea why I get this behavior, please let me know. I attached screenshots of the graphs.