Issue with Agent Collecting 10 Products and Returning to Starting Point in a 12x11 Matrix

I have a 12x11 matrix. I want my agent to start from row 0, column 5 (reward value 1), collect 10 products (reward value 100 each), and then return to the starting point. The problem is that once the agent starts acting, it collects a single reward and stays there. I want it to collect the remaining products and bring them back to row 0, column 5. What is my mistake?
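
For reference, here is a minimal sketch of the reward layout I am describing; the product coordinates below are just placeholders, since the real ones are read from my order CSV:

import numpy as np

layout = np.full((12, 11), -5.)        # default step penalty for every cell
layout[0, 5] = 1.                      # starting point (row 0, column 5)
for r in range(2, 11):                 # obstacle cells the agent should avoid
    for c in (1, 3, 5, 7, 9):
        layout[r, c] = -100.
for r, c in [(3, 1), (5, 7), (8, 9)]:  # placeholder product cells, reward 100 each
    layout[r, c] = 100.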

import numpy as np
import pandas as pd

environment_rows = 12
environment_columns = 11

# Hyperparameters
alpha = 0.1  # learning rate
gamma = 0.8  # discount factor
epsilon = 0.99  # exploration rate
num_episodes = 1000  # number of episodes for training

actions = ['up', 'right', 'down', 'left']
q_values = np.zeros((environment_rows, environment_columns, 4))

rewards = np.full((environment_rows, environment_columns), -5.)
rewards[0, 5] = 1

# Columns in rows 2-10 that contain obstacle cells (reward -100)
depo = {row: [1, 3, 5, 7, 9] for row in range(2, 11)}

for row_index in range(2, 11):
    for column_index in depo[row_index]:
        rewards[row_index, column_index] = -100.
                
def define_order_states(file_path):
    # Read the order list from the CSV path passed in (not a hard-coded path)
    order_list = pd.read_csv(file_path)

    # Reward assigned to ordered products
    order_rewards = 100

    # For every row in the order list
    for index, row in order_list.iterrows():
        # Get the product's row and column indices
        product_row = row['Satır']
        product_column = row['Sütun']

        # Update the corresponding cell in the rewards table with the order reward
        rewards[product_row, product_column] = order_rewards

# Call the function with the file path argument
define_order_states("C:/Users/kullanıcı/Desktop/siparis_listesi.csv")

def get_starting_location():
    current_row_index = 0
    current_column_index = 5
    return current_row_index, current_column_index

def get_next_action(current_row_index, current_column_index, epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(4)
    else:
        return np.argmax(q_values[current_row_index, current_column_index])

def get_next_location(current_row_index, current_column_index, action_index):
    new_row_index = current_row_index
    new_column_index = current_column_index

    if actions[action_index] == 'up' and current_row_index > 0:
        new_row_index -= 1
    elif actions[action_index] == 'right' and current_column_index < environment_columns - 1:
        new_column_index += 1
    elif actions[action_index] == 'down' and current_row_index < environment_rows - 1:
        new_row_index += 1
    elif actions[action_index] == 'left' and current_column_index > 0:
        new_column_index -= 1

    return new_row_index, new_column_index

# Learning loop
for episode in range(num_episodes):
    # Get starting location for the episode
    current_state = get_starting_location()
    is_terminal = False

    while not is_terminal:
        # Choose action using epsilon-greedy strategy
        action = get_next_action(current_state[0], current_state[1], epsilon)

        # Get next location based on the chosen action
        next_state = get_next_location(current_state[0], current_state[1], action)

        # Get immediate reward for the next state
        immediate_reward = rewards[next_state[0], next_state[1]]

        # Update Q-value for the current state and action using Bellman equation
        q_values[current_state[0], current_state[1], action] += alpha * (
                immediate_reward + gamma * np.max(q_values[next_state[0], next_state[1]]) -
                q_values[current_state[0], current_state[1], action])
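        # The line above is the standard Q-learning temporal-difference update:
        #   Q(s, a) <- Q(s, a) + alpha * [r + gamma * max_a' Q(s', a') - Q(s, a)]
        # where s is the current state, a the chosen action, r the immediate
        # reward for the next state s', alpha the learning rate and gamma the
        # discount factor.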

        # Move to the next state
        current_state = next_state

        # Check if the agent reached the terminal state
        if rewards[current_state[0], current_state[1]] == 100 or rewards[current_state[0], current_state[1]] == -100:
            is_terminal = True
        
def reset_order_rewards():
    global rewards
    rewards = np.full((environment_rows, environment_columns), -1.)
    rewards[0, 5] = 100.
    for row_index in range(2, 11):
        for column_index in depo[row_index]:
            rewards[row_index, column_index] = -100.
    # Set the reward for returning to the starting point
    rewards[0, 5] = 50.  # e.g., 50 points

# Follow the optimal policy and return to the starting point
def follow_optimal_policy():
    current_state = get_starting_location()
    total_reward = 0
    route = [current_state]

    while True:
        action = np.argmax(q_values[current_state[0], current_state[1]])
        next_state = get_next_location(current_state[0], current_state[1], action)
        immediate_reward = rewards[next_state[0], next_state[1]]
        total_reward += immediate_reward
        current_state = next_state
        route.append(current_state)

        print(f"Seçilen aksiyon: {actions[action]}, Yeni durum: {current_state}")

        if immediate_reward == 100 or immediate_reward == -100 or current_state == get_starting_location():
            break

    return total_reward, route

# Training loop
for episode in range(num_episodes):
    # Reduce epsilon by 10% each episode
    epsilon = epsilon * 0.9

    current_state = get_starting_location()
    is_terminal = False

    while not is_terminal:
        action = get_next_action(current_state[0], current_state[1], epsilon)
        next_state = get_next_location(current_state[0], current_state[1], action)
        immediate_reward = rewards[next_state[0], next_state[1]]

        q_values[current_state[0], current_state[1], action] += alpha * (
                immediate_reward + gamma * np.max(q_values[next_state[0], next_state[1]]) -
                q_values[current_state[0], current_state[1], action])

        current_state = next_state

        if rewards[current_state[0], current_state[1]] == 100 or rewards[current_state[0], current_state[1]] == -100:
            is_terminal = True

            # Update the Q-table for the terminal state (if needed)
            if rewards[current_state[0], current_state[1]] == 100:
                q_values[current_state[0], current_state[1]] = 100  # or another suitable value for the terminal state
                
# Follow the optimal policy and return to the starting point
total_reward, route = follow_optimal_policy()
print(f"Toplam ödül: {total_reward}")
print(f"Optimal rota: {route}")
