DDPG not converging for gym inverted pendulum problem

74 Views Asked by At

Im new to RL and PyTorch but for my first project im training a DDPG DQN to solve the inverted pendulum problem. The thing is after referencing code online and fixing some issues, I finally got the model to run but it never improves. It seems like the model is not training at all. Is there something wrong with my code implementation? Or does it have to do with my hyperparameters not being just right. Thank you!

This is my code

class ReplayBuffer:
    def __init__(self, buffer_limit):
        self.buffer = deque(maxlen=buffer_limit)

    def put(self, transition):
        self.buffer.append(transition)

    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []

        for transition in mini_batch:
            s, a, r, s_prime, done = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask = 0.0 if done else 1.0
            done_mask_lst.append([done_mask])

        s_batch = torch.tensor(s_lst, dtype=torch.float)
        a_batch = torch.tensor(a_lst, dtype=torch.float)
        r_batch = torch.tensor(r_lst, dtype=torch.float)
        s_prime_batch = torch.tensor(s_prime_lst, dtype=torch.float)
        done_batch = torch.tensor(done_mask_lst, dtype=torch.float)

        return s_batch, a_batch, r_batch, s_prime_batch, done_batch

    def size(self):
        return len(self.buffer)
class Actor(nn.Module):
    """
    Actor (Policy) Model for the DDPG algorithm.
    """

    def __init__(
        self, state_size, action_size
    ):
        """
        Initialize actor model.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            fc1_units (int): Number of nodes in the first hidden layer.
            fc2_units (int): Number of nodes in the second hidden layer.
        """
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        """
        Reset model weights with appropriate initialization.
        """
        # self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        # self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """
        Build an actor (policy) network that maps states to actions.

        Args:
            state (torch.Tensor): Input state.

        Returns:
            torch.Tensor: Output actions.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        outputs = F.tanh(self.fc3(x))
        outputs = torch.mul(2, outputs) # Action space between -2.0 and 2.0
        return outputs


class Critic(nn.Module):
    """
    Critic (Value) Model for the DDPG algorithm.
    """

    def __init__(self, state_size, action_size):
        """
        Initialize critic model.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            fcs1_units (int): Number of nodes in the first hidden layer.
            fc2_units (int): Number of nodes in the second hidden layer.
        """
        super(Critic, self).__init__()
        # State input
        self.fcs1 = nn.Linear(state_size, 16)
        self.fcs2 = nn.Linear(16, 32)

        # Action input
        self.fca1 = nn.Linear(action_size, 32)

        self.fc1 = nn.Linear(64, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    #     self.reset_parameters()

    # def reset_parameters(self):
    #     """
    #     Reset model weights with appropriate initialization.
    #     """
    #     self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1))
    #     self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
    #     self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
    #     self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """
        Build a critic (value) network that maps (state, action) pairs to Q-values.

        Args:
            state (torch.Tensor): Input state.
            action (torch.Tensor): Input action.

        Returns:
            torch.Tensor: Output Q-values.
        """
        state_out = F.relu(self.fcs1(state))
        state_out = F.relu(self.fcs2(state_out))
        action_out = F.relu(self.fca1(action))
        x = torch.cat((state_out, action_out), dim=-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array(
            [random.random() for i in range(len(x))]
        )
        self.state = x + dx
        return self.state
class DDPGAgent:
    """Interacts with and learns from the environment."""

    def __init__(self):
        """Initialize an Agent object."""
        self.state_dim = 3
        self.action_dim = 1
        self.actor_lr = 0.001
        self.critic_lr = 0.001
        self.batch_size = 64
        self.buffer_limit = 50000
        self.tau = 0.005
        self.gamma = 0.99

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.state_dim, self.action_dim)
        self.actor_target = Actor(self.state_dim, self.action_dim)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(), lr=self.actor_lr
        )

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.state_dim, self.action_dim)
        self.critic_target = Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.critic_lr,
        )

        # Noise process
        self.noise = OUNoise(self.action_dim)

        # Replay memory
        self.memory = ReplayBuffer(self.buffer_limit)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.put((state, action, reward, next_state, done))

        # Learn, if enough samples are available in memory
        if self.memory.size() > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float()
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -2, 2)

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        actions = actions.squeeze(dim=1)

        # Update critic
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        Params:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + local_param.data * self.tau)
def train_DDPGAgent():
    # Initalize the DDPG Agent and related variables required
    agent = DDPGAgent()
    env = gym.make('Pendulum-v1', g=9.81)
    episodes = 800
    total_rewards = []
    no_of_steps = []
    success_count = 0
    frames = []
    best_episode = 0 
    best_reward = float('-inf')

    for episode in range(episodes):
        state = env.reset()
        score, done = 0.0, False
        start_time = datetime.datetime.now()
        counter = 0

        while not done:
            counter += 1
            action = agent.act(state, add_noise=True)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)

            score += reward
            state = next_state

            # if counter % 50 == 0 and score > -50:
            #     screen = env.render(mode='rgb_array')
            #     frames.append(screen)

        # Recording results
        if len(total_rewards) > 0:
            success_count += (score - total_rewards[-1]) >= 200
        total_rewards.append(score)
        no_of_steps.append(counter)

        if score > best_reward:
            best_reward = score
            best_episode = episode

        # Saving the Models
        save_folder = "DDPG"
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)

        if episode == best_episode:
            model_actor = os.path.join(save_folder, "DDPG_actor" + str(episode) + ".pt")
            model_critic = os.path.join(save_folder, "DDPG_critic" + str(episode) + ".pt")
            torch.save(agent.actor_local.state_dict(), model_actor)
            torch.save(agent.critic_local.state_dict(), model_critic)
            
        if episode % 10 == 0:
            elapsed_time = datetime.datetime.now() - start_time
            print('Episode {:>4} | Total Reward: {:>8.2f} | Elapsed: {}'.format(episode, score, elapsed_time))

    env.close()

    return {
        'total_rewards': total_rewards,
        'no_of_steps': no_of_steps,
        'success_count': success_count,
        'frames': frames
    }

DDPG_results = train_DDPGAgent()

I've tried tuning the hyperparameters from critic lr, actor lr, batch size, buffer size, but it doesnt seem to help. Also the tensorflow code I had referenced for this had incredible results which is why I was confused when I could not get anything close to it. I've been trying for so long to get it to work but it just isnt working. Any help is appreciated!

References:

  1. Tensorflow code: https://keras.io/examples/rl/ddpg_pendulum/
  2. Pytorch article on DDPG for mountain cart: https://archive.is/2nuI8
  3. Pytorch DDPG gym-pendulum problem: https://github.com/udacity/deep-reinforcement-learning/blob/master/ddpg-pendulum/DDPG.ipynb
0

There are 0 best solutions below