I am currently working on an RL agent that has to learn how to play Pacman, and I implemented a PPO algorithm. It worked fine on a small map with only a single pile to eat and one ghost (the pile was spawned randomly at each episode).
I then decided to move on to a "classic" map, with piles at every empty space, 2 ghosts, and 2 powerups.
I have tried many configurations so far, but the policy loss keeps decreasing too fast. In less than 50 episodes the loss is close to 0, yet the reward is still negative and Pacman's behavior is really dumb.
Here is the code of my PPO algorithm:

```python
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import gym
import random

DEVICE = "cpu"

# Policy and value model
class ActorCriticNetwork(nn.Module):
    def __init__(self, obs_space_size, action_space_size):
        super().__init__()
        self.shared_layers = nn.Sequential(
            nn.Linear(obs_space_size, 256),
            nn.Tanh(),
            nn.Linear(256, 256),
            nn.Tanh(),
            nn.Linear(256, 256),
            nn.Tanh(),
        )
        self.policy_layers = nn.Sequential(
            nn.Linear(256, 256),
            nn.Tanh(),
            nn.Linear(256, action_space_size),
            nn.Softmax(dim=-1)
        )
        self.value_layers = nn.Sequential(
            nn.Linear(256, 256), nn.Tanh(), nn.Linear(256, 1)
        )

    def value(self, obs):
        z = self.shared_layers(obs)
        value = self.value_layers(z)
        return value

    def policy(self, obs):
        z = self.shared_layers(obs)
        policy_logits = self.policy_layers(z)
        return policy_logits

    def forward(self, obs):
        z = self.shared_layers(obs)
        policy_logits = self.policy_layers(z)
        value = self.value_layers(z)
        return policy_logits, value

# def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
#     if isinstance(layer, nn.Linear):
#         torch.nn.init.orthogonal_(layer.weight, std)
#         torch.nn.init.constant_(layer.bias, bias_const)
#     return layer

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    # Xavier/Glorot initialization
    if isinstance(layer, nn.Linear):
        fan_in, fan_out = nn.init._calculate_fan_in_and_fan_out(layer.weight)
        std = np.sqrt(2.0 / (fan_in + fan_out))
        nn.init.normal_(layer.weight, mean=0.0, std=std)
        nn.init.constant_(layer.bias, bias_const)
    return layer

class PPOTrainer:
    def __init__(
        self,
        actor_critic,
        ppo_clip_value=0.2,
        target_kl_div=0.01,
        max_policy_train_iters=80,
        value_train_iters=80,
        policy_lr=3e-4,
        value_lr=1e-2,
    ):
        self.ac = actor_critic
        self.ppo_clip_val = ppo_clip_value
        self.target_kl_div = target_kl_div
        self.max_policy_train_iters = max_policy_train_iters
        self.value_train_iters = value_train_iters

        policy_params = list(self.ac.shared_layers.parameters()) + list(
            self.ac.policy_layers.parameters()
        )
        self.policy_optim = optim.Adam(policy_params, lr=policy_lr)

        value_params = list(self.ac.shared_layers.parameters()) + list(
            self.ac.value_layers.parameters()
        )
        self.value_optim = optim.Adam(value_params, lr=value_lr)

    def train_policy(self, obs, acts, old_log_probs, gaes):
        losses = []
        for _ in range(self.max_policy_train_iters):
            self.policy_optim.zero_grad()

            new_logits = self.ac.policy(obs)
            new_logits = Categorical(logits=new_logits)
            new_log_probs = new_logits.log_prob(acts)

            policy_ratio = torch.exp(new_log_probs - old_log_probs)
            clipped_ratio = policy_ratio.clamp(
                1 - self.ppo_clip_val, 1 + self.ppo_clip_val
            )
            clipped_loss = clipped_ratio * gaes
            full_loss = policy_ratio * gaes
            # policy_loss = -torch.min(full_loss, clipped_loss).mean()
            # In the policy training loop: clipped surrogate loss plus an entropy bonus
            policy_loss = -torch.min(full_loss, clipped_loss).mean() - 0.01 * new_logits.entropy().mean()

            policy_loss.backward()
            losses.append(policy_loss.item())
            self.policy_optim.step()

            kl_div = (old_log_probs - new_log_probs).mean()
            if kl_div >= self.target_kl_div:
                break
        return np.asarray(losses).mean()

    def train_value(self, obs, returns):
        loss = []
        for _ in range(self.value_train_iters):
            self.value_optim.zero_grad()

            values = self.ac.value(obs)
            value_loss = (returns - values) ** 2
            value_loss = value_loss.mean()

            value_loss.backward()
            loss.append(value_loss.item())
            self.value_optim.step()
        return np.asarray(loss).mean()

def discount_rewards(rewards, gamma=0.99):
    """
    Return discounted rewards based on the given rewards and gamma param
    """
    new_rewards = [float(rewards[-1])]
    for i in reversed(range(len(rewards) - 1)):
        new_rewards.append(float(rewards[i]) + gamma * new_rewards[-1])
    return np.array(new_rewards[::-1])


def calculate_gaes(rewards, values, gamma=0.99, decay=0.95):
    next_values = np.concatenate([values[1:], [0]])
    deltas = [
        rew + gamma * next_val - val
        for rew, val, next_val in zip(rewards, values, next_values)
    ]

    gaes = []
    gae = 0
    for delta in reversed(deltas):
        gae = delta + gamma * decay * gae
        gaes.append(gae)
    gaes.reverse()
    return np.array(gaes)

def rollout(model, env, epsilon=0.2):
    """
    Performs a single rollout.
    Returns training data in the shape (n_steps, observation_shape) and the cumulative reward.
    """
    ### Create data storage
    train_data = [[], [], [], [], []]  # obs, act, reward, values, act_log_probs
    obs, _ = env.reset()
    ep_reward = 0

    while True:
        logits, val = model(torch.tensor(obs, dtype=torch.float32, device=DEVICE))
        act_distribution = Categorical(logits=logits)

        if random.random() < epsilon:
            # Explore: pick a random action
            act = torch.tensor([env.action_space.sample()], device=DEVICE, dtype=torch.int32)
        else:
            # Exploit: sample an action from the policy distribution
            act = act_distribution.sample()

        act_log_prob = act_distribution.log_prob(act).item()
        act, val = act.item(), val.item()

        next_obs, reward, done, _, _ = env.step(act)

        for i, item in enumerate((obs, act, reward, val, act_log_prob)):
            train_data[i].append(item)

        obs = next_obs
        ep_reward += reward
        if done:
            break

    train_data = [np.asarray(x) for x in train_data]
    # Replace the raw values with GAE advantages
    train_data[3] = calculate_gaes(train_data[2], train_data[3])
    return train_data, ep_reward
```
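
For context, the pieces above are wired together roughly like this (a simplified sketch, not my exact script: the environment id "MyPacmanEnv-v0", the episode count, and the logging line are placeholders, and I assume a flat Box observation space):

```python
# Rough sketch of the training loop; the env id and hyperparameters below are placeholders
env = gym.make("MyPacmanEnv-v0")  # hypothetical id for my custom Pacman environment
model = ActorCriticNetwork(env.observation_space.shape[0], env.action_space.n).to(DEVICE)
ppo = PPOTrainer(model)

n_episodes = 500  # placeholder
for episode in range(n_episodes):
    train_data, ep_reward = rollout(model, env)
    obs, acts, rewards, gaes, act_log_probs = train_data

    # Shuffle the collected transitions
    permute_idxs = np.random.permutation(len(rewards))

    obs = torch.tensor(obs[permute_idxs], dtype=torch.float32, device=DEVICE)
    acts = torch.tensor(acts[permute_idxs], dtype=torch.int32, device=DEVICE)
    gaes = torch.tensor(gaes[permute_idxs], dtype=torch.float32, device=DEVICE)
    act_log_probs = torch.tensor(act_log_probs[permute_idxs], dtype=torch.float32, device=DEVICE)

    # Monte Carlo returns as value targets; unsqueeze so the shape matches the (N, 1) value head output
    returns = discount_rewards(rewards)[permute_idxs]
    returns = torch.tensor(returns, dtype=torch.float32, device=DEVICE).unsqueeze(-1)

    policy_loss = ppo.train_policy(obs, acts, act_log_probs, gaes)
    value_loss = ppo.train_value(obs, returns)

    print(f"Episode {episode} | reward {ep_reward:.1f} | policy loss {policy_loss:.4f} | value loss {value_loss:.4f}")
```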
I tried many learning rates (from 1e-2 to 5e-5; 3e-5 is, I think, the best one). My reward function works like this (a simplified sketch follows the list):
- Eating a pile: +1
- Eating a powerup: +5
- Eating a ghost: +10
- Losing (a ghost ate Pacman): -20
- Winning (ate all the piles): +50
- Making 1000 moves without completing the game: -10
- Doing none of the above: -0.1
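
In code, this corresponds roughly to the following (the flag names and max_moves are placeholders, not my exact implementation):

```python
# Simplified sketch of the reward shaping; the flags and max_moves are placeholder names
def compute_reward(ate_pile, ate_powerup, ate_ghost, lost, won, n_moves, max_moves=1000):
    if lost:                      # a ghost ate Pacman
        return -20
    if won:                       # all piles eaten
        return 50
    if n_moves >= max_moves:      # 1000 moves without completing the game
        return -10
    if ate_ghost:
        return 10
    if ate_powerup:
        return 5
    if ate_pile:
        return 1
    return -0.1                   # none of the above
```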
The observation is a 1D array representing the map (a small example follows the list), where:
- 0 is an empty space
- 1 is a wall
- 2 is the pacman
- 3 is a ghost
- 4 is a powerup
- 5 is a pile
- 6 is a ghost that can be eaten
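
For illustration, on a tiny hypothetical grid (not my actual map, which is much larger) the encoding looks like this:

```python
# Hypothetical 5x5 grid, flattened into the 1D observation
# 0 = empty, 1 = wall, 2 = pacman, 3 = ghost, 4 = powerup, 5 = pile, 6 = edible ghost
grid = np.array([
    [1, 1, 1, 1, 1],
    [1, 2, 5, 5, 1],
    [1, 5, 1, 4, 1],
    [1, 5, 3, 5, 1],
    [1, 1, 1, 1, 1],
])
obs = grid.flatten().astype(np.float32)  # shape (25,), fed directly to the network
```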
If you have any idea why I get this behavior, please let me know. I attached screenshots of the graphs.