How can I edit this Q-learning code's logic playing Chopsticks(hand game)?

27 Views Asked by At
def reverse_state(state): # reverse the state
    return np.array([state[1, :],state[0, :]])

def env(Q, epsilon, eta, gamma, pi):
    state = np.array([[1, 1], [1, 1]]) # 초기 상태 first state
    turn = 0 # 턴 수 turn number
    action = get_action(state, Q, epsilon, pi) # 초기 행동 first action

    while (1):
        turn += 1 # 턴 수

        # 다음 단계 state 구하기 get nextstate with action
        state_next = get_next_state(state, action) 

        # 보상 부여 후 다음 행동 계산 give reward and get next action
        if (state[1, :] == [0, 0]).all(): # 이긴 경우 # is win
            reward = 1
            action = np.nan
        else:
            reward = 0
            action = get_action(state_next, Q, epsilon, pi)

        # 가치함수 수정 edit Q function
        Q = Q_learning(state, action, reward, state_next, Q, eta, gamma)


        # 종료 여부 판정 is done
        if (state[0, :] == [0, 0]).all() or (state[1, :] == [0, 0]).all():
            break
        else:
            state = state_next
    return [turn, Q]

When the number of turn is an even number(when it is the opponent player's viewpoint), I want to reverse the state and then complete the calculations, and then reverst one more time to return to the original viewpoint before calculating on the next turn. I have no idea what to do here.

I tried it

def env(Q, epsilon, eta, gamma, pi): # 환경
    state = np.array([[1, 1], [1, 1]]) # 초기 상태
    turn = 0 # 턴 수
    action = get_action(state, Q, epsilon, pi) # 초기 행동

    while (1):
        turn += 1 # 턴 수
        if turn % 2 == 0:
            # 다음 단계 state 구하기
            state = reverse_state(state)
            state_next = get_next_state(state, action)

            # 보상 부여 후 다음 행동 계산
            if (state[1, :] == [0, 0]).all(): # 이긴 경우
                reward = 1
                action = np.nan
            else:
                reward = 0
                action = get_action(state_next, Q, epsilon, pi)

            # 가치함수 수정
            Q = Q_learning(state, action, reward, state_next, Q, eta, gamma)

        else:
            # 다음 단계 state 구하기
            state_next = get_next_state(state, action)
            # 보상 부여 후 다음 행동 계산
            if (state[1, :] == [0, 0]).all(): # 이긴 경우
                reward = 1
                action = np.nan
            else:
                reward = 0
                action = get_action(state_next, Q, epsilon, pi)

            # 가치함수 수정
            Q = Q_learning(state, action, reward, state_next, Q, eta, gamma)


        # 종료 여부 판정
        if (state[0, :] == [0, 0]).all() or (state[1, :] == [0, 0]).all():
            break
        else:
            if turn % 2 == 0:
                state = reverse_state(state_next)
            else:
                state = state_next
        print(state)
    return [turn, Q]

like this but it doesn't work.

0

There are 0 best solutions below