I'm trying to solve the CartPole-v1 Gymnasium environment using only the cart position and the pole angle, but the mean reward over the last 100 episodes never gets above 20. The longest training run I did was 90,000 episodes, and the agent still didn't get more than 20 reward.

The algorithm I'm using is tabular Q-learning, with epsilon-greedy action selection.
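For reference, the update rule I'm trying to implement is the standard tabular Q-learning one, where s and a are the previous state and action and s' is the new state:

Q(s, a) <- Q(s, a) + ALPHA * (reward + GAMMA * max_a' Q(s', a') - Q(s, a))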

This is the code I implemented:

import gymnasium as gym
import numpy as np
import math

# Discretisation ranges: cart position in metres, pole angle in degrees
max_x = 2.4
min_x = -2.4
max_theta = 12
min_theta = -12
x_bins = 24
theta_bins = 24

x_axis_space = np.linspace(min_x, max_x, x_bins)
theta_axis_space = np.linspace(min_theta, max_theta, theta_bins)
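# Note: np.digitize(v, bins) returns an index in 0..len(bins) (len(bins)+1
# possible values), which is why the Q-table below is sized n_bins+1 per axis.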

# Env setup - First state
env = gym.make('CartPole-v1')
obs, info = env.reset() 

# obs[0] is the cart position (m); obs[2] is the pole angle in radians,
# converted to degrees so it matches the theta bins
last_state = state = (np.digitize(obs[0], x_axis_space), np.digitize(obs[2]*180/math.pi, theta_axis_space))

# Hyperparameters
GAMMA = 0.99        # Discount factor (closer to 1 weights long-term reward more heavily)
ALPHA = 0.1         # Learning rate
EPSILON = 1.0       # Start with 100% exploration
DECAY_RATIO = 1 - 0.00001   # Multiplicative epsilon decay applied after each episode
N_EPISODES = 4000
MAX_REWARD = 500
total_reward = 0
episode_reward = []


class CartPoleQAgent():
    def __init__(self, n_bins_x, n_bins_theta, n_actions):    # x bins, theta bins, actions (left/right)
        self.n_bins_x = n_bins_x
        self.n_bins_theta = n_bins_theta
        self.n_actions = n_actions
        self.q_table = np.zeros((n_bins_x+1, n_bins_theta+1, n_actions))


def exp_dec_epsilon_greedy(q_table, state, finish_training):
    global EPSILON

    if np.random.random() > EPSILON or finish_training == 1:    # Exploit
        # Select the greedy action (index of the largest Q-value for this state)
        return int(np.argmax(q_table[state[0]][state[1]]))
    else:                                                       # Explore
        # Select a random action
        return env.action_space.sample()


def update_q_value(q_table, last_state, action, reward, state):
    global GAMMA, ALPHA
    action = int(action)
    last_x = last_state[0]
    last_theta = last_state[1]
    x = state[0]
    theta = state[1]

    # TD update; note the target bootstraps on Q(new_state, action) for the
    # action just taken rather than on the max over next-state actions
    return (q_table[last_x][last_theta][action] + ALPHA*(reward + GAMMA*q_table[x][theta][action] - q_table[last_x][last_theta][action]))


if __name__ == "__main__":
    
    agent = CartPoleQAgent(24, 24, 2)
    finish_training = 0
    i_episode = 0
    mean_reward = 0
    
    while mean_reward < 500:    # Train until the 100-episode mean reaches the max return (500)
        i_episode_reward = 0  
        
        while True:     # Run one episode until it ends
            action = exp_dec_epsilon_greedy(agent.q_table, state, finish_training)
            
            # Gymnasium's step() returns a 5-tuple: obs, reward, terminated, truncated, info
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            i_episode_reward = i_episode_reward + reward

            if done:    # The episode has ended
                # Reset the env and re-discretise the start state for the next episode
                obs, info = env.reset()
                last_state = state = (np.digitize(obs[0], x_axis_space), np.digitize(obs[2]*180/math.pi, theta_axis_space))
                break
            
            state = (np.digitize(obs[0], x_axis_space), np.digitize(obs[2]*180/math.pi, theta_axis_space))
            agent.q_table[last_state[0]][last_state[1]][action] = update_q_value(agent.q_table, last_state, action, reward, state)
            last_state = state
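            # NB: when done is True we break before reaching the update above, so
            # the terminal transition is never written into the Q-table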
        
        episode_reward.append(i_episode_reward)
        EPSILON = EPSILON * DECAY_RATIO 

        mean_reward = np.mean(episode_reward[-100:])
        print(f"Episode: {i_episode} Episode Reward: {i_episode_reward} eps: {EPSILON} Mean Reward: {mean_reward}")
        i_episode += 1
    
    env.close()

I tried that implementation and, as I said, I didn't get more than a 20 average reward.
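In case it helps to compare, here is a minimal sketch of how I understand the textbook update, with the max over next-state actions instead of the taken action (same variable names as my code above; I haven't verified that this alone fixes the problem):

def q_learning_update(q_table, last_state, action, reward, state):
    # Standard Q-learning TD target: reward plus GAMMA times the best
    # Q-value in the new state (max over actions, not the action just taken)
    best_next = q_table[state[0]][state[1]].max()
    old_q = q_table[last_state[0]][last_state[1]][action]
    return old_q + ALPHA * (reward + GAMMA * best_next - old_q)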
