I have a 12x11 matrix. I want my agent to start at row 0, column 5 (reward 1), collect 10 products (reward 100 each), and return to the starting point. The problem is that once the agent starts acting, it collects a single reward and stays there. I want it to collect the remaining products and bring them back to row 0, column 5. What is my mistake?
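To make the goal concrete, here is a simplified sketch of what a successful episode should look like (illustration only; the product cells below are placeholders, not the ones my script reads from the CSV):

# Illustration only -- not part of my script.
# The agent should start at (0, 5), pass through every ordered
# product cell at least once, and end back at (0, 5).
product_cells = {(2, 0), (4, 2), (6, 4)}  # placeholder order locations

def is_intended_route(route):
    starts_and_ends_home = route[0] == (0, 5) and route[-1] == (0, 5)
    collects_everything = product_cells.issubset(set(route))
    return starts_and_ends_home and collects_everything

print(is_intended_route([(0, 5), (2, 0), (4, 2), (6, 4), (0, 5)]))  # True
print(is_intended_route([(0, 5), (2, 0)]))                          # False

My full script is below.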
import numpy as np
import pandas as pd
environment_rows = 12
environment_columns = 11
# Hyperparameters
alpha = 0.1 # learning rate
gamma = 0.8 # discount factor
epsilon = 0.99 # exploration rate
num_episodes = 1000 # number of episodes for training
actions = ['up', 'right', 'down', 'left']
q_values = np.zeros((environment_rows, environment_columns, 4))
rewards = np.full((environment_rows, environment_columns), -5.)
rewards[0, 5] = 1
# Shelf (obstacle) columns for rows 2..10
depo = {row_index: [1, 3, 5, 7, 9] for row_index in range(2, 11)}
for row_index in range(2, 11):
    for column_index in depo[row_index]:
        rewards[row_index, column_index] = -100.
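# At this point the rewards grid holds 1 at the start cell (0, 5),
# -100 on the shelf cells, and -5 everywhere else; the order cells
# are set to 100 by define_order_states() below.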
def define_order_states(file_path):
    order_list = pd.read_csv(file_path)
    # Reward assigned to each ordered product's cell
    order_rewards = 100
    # For every row in the order list
    for index, row in order_list.iterrows():
        # Get the product's row and column numbers
        product_row = row['Satır']
        product_column = row['Sütun']
        # Update the corresponding cell in the rewards table with the order reward
        rewards[product_row, product_column] = order_rewards

# Call the function with the file path as its argument
define_order_states("C:/Users/kullanıcı/Desktop/siparis_listesi.csv")
def get_starting_location():
    current_row_index = 0
    current_column_index = 5
    return current_row_index, current_column_index
def get_next_action(current_row_index, current_column_index, epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(4)
    else:
        return np.argmax(q_values[current_row_index, current_column_index])
def get_next_location(current_row_index, current_column_index, action_index):
    new_row_index = current_row_index
    new_column_index = current_column_index
    if actions[action_index] == 'up' and current_row_index > 0:
        new_row_index -= 1
    elif actions[action_index] == 'right' and current_column_index < environment_columns - 1:
        new_column_index += 1
    elif actions[action_index] == 'down' and current_row_index < environment_rows - 1:
        new_row_index += 1
    elif actions[action_index] == 'left' and current_column_index > 0:
        new_column_index -= 1
    return new_row_index, new_column_index
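# Note (added for illustration): get_next_location clamps at the grid
# edges, so an out-of-bounds move keeps the agent in place, e.g.
# get_next_location(0, 5, actions.index('up')) returns (0, 5).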
# Learning loop
for episode in range(num_episodes):
    # Get starting location for the episode
    current_state = get_starting_location()
    is_terminal = False
    while not is_terminal:
        # Choose action using epsilon-greedy strategy
        action = get_next_action(current_state[0], current_state[1], epsilon)
        # Get next location based on the chosen action
        next_state = get_next_location(current_state[0], current_state[1], action)
        # Get immediate reward for the next state
        immediate_reward = rewards[next_state[0], next_state[1]]
        # Update Q-value for the current state and action using Bellman equation
        q_values[current_state[0], current_state[1], action] += alpha * (
            immediate_reward + gamma * np.max(q_values[next_state[0], next_state[1]]) -
            q_values[current_state[0], current_state[1], action])
        # Move to the next state
        current_state = next_state
        # Check if the agent reached the terminal state
        if rewards[current_state[0], current_state[1]] == 100 or rewards[current_state[0], current_state[1]] == -100:
            is_terminal = True
def reset_order_rewards():
    global rewards
    rewards = np.full((environment_rows, environment_columns), -1.)
    rewards[0, 5] = 100.
    for row_index in range(2, 11):
        for column_index in depo[row_index]:
            rewards[row_index, column_index] = -100.
    # Set the reward for returning to the starting point
    rewards[0, 5] = 50.  # e.g. 50 points
# Follow the learned policy greedily and try to return to the starting point
def follow_optimal_policy():
    current_state = get_starting_location()
    total_reward = 0
    route = [current_state]
    while True:
        action = np.argmax(q_values[current_state[0], current_state[1]])
        next_state = get_next_location(current_state[0], current_state[1], action)
        immediate_reward = rewards[next_state[0], next_state[1]]
        total_reward += immediate_reward
        current_state = next_state
        route.append(current_state)
        print(f"Chosen action: {actions[action]}, new state: {current_state}")
        if immediate_reward == 100 or immediate_reward == -100 or current_state == get_starting_location():
            break
    return total_reward, route
# Training loop
for episode in range(num_episodes):
    # Decay epsilon by 10% each episode
    epsilon = epsilon * 0.9
    current_state = get_starting_location()
    is_terminal = False
    while not is_terminal:
        action = get_next_action(current_state[0], current_state[1], epsilon)
        next_state = get_next_location(current_state[0], current_state[1], action)
        immediate_reward = rewards[next_state[0], next_state[1]]
        q_values[current_state[0], current_state[1], action] += alpha * (
            immediate_reward + gamma * np.max(q_values[next_state[0], next_state[1]]) -
            q_values[current_state[0], current_state[1], action])
        current_state = next_state
        if rewards[current_state[0], current_state[1]] == 100 or rewards[current_state[0], current_state[1]] == -100:
            is_terminal = True
    # Update the Q-table for the terminal state (if needed)
    if rewards[current_state[0], current_state[1]] == 100:
        q_values[current_state[0], current_state[1]] = 100  # or another suitable value for the terminal state
# Follow the optimal policy and return to the starting point
total_reward, route = follow_optimal_policy()
print(f"Total reward: {total_reward}")
print(f"Optimal route: {route}")