Trying to implement the value iteration algorithm, but getting an error I cannot resolve
I am currently trying to implement the value iteration algorithm. The goal is for the algorithm to be self-contained so that it can be reused for different problems. That is why I wrote the code in an object-oriented style. When I run it, I get the following error message:
File "/Users", line 81, in <module>
policy, V = value_iteration(env, theta=1e-8, discount_factor=0.8, multipleOptimalActions=True)
File "/Users", line 40, in value_iteration
V = dict.fromkeys(action_space, 0.0)
NameError: name 'action_space' is not defined ```
Traceback (most recent call last):
File "/Users", line 81, in <module>
policy, V = value_iteration(env, theta=1e-8, discount_factor=0.8, multipleOptimalActions=True)
File "/Users", line 40, in value_iteration
V = dict.fromkeys(action_space, 0.0)
NameError: name 'action_space' is not defined
I have really tried everything, but I cannot solve the problem. I have provided the relevant code below so that you can reproduce the error yourselves and, hopefully, find a solution. I am grateful for any help.
The value iteration algorithm is shown below; the implementation uses a helper function one_step_lookahead(). Right after the listing I also show a small standalone example of what this helper is supposed to compute.
import numpy as np
import matplotlib.pyplot as plt

# value_iteration.py
def value_iteration(env, theta, discount_factor, multipleOptimalActions=False, optimalPolicyPrecision=0.0):
    """
    Value Iteration Algorithm.

    Args:
        env: Environment instance. env.OSD(s, a) returns a list of transition
            tuples (prob, next_state, reward) for taking action a in state s;
            env.get_state_space() returns all states and env.get_action_space(s)
            the actions available in state s.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.
        multipleOptimalActions: If True, keep every action whose value is within
            optimalPolicyPrecision of the best value instead of a single best action.
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """
    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value of every action in a given state.
        Args:
            state: The state to consider (int)
            V: The current value function estimate, a dict mapping states to values
        Returns:
            A dict mapping each available action to its expected value.
        """
        action_space = env.get_action_space(state)  # change this?
        action_values = dict.fromkeys(action_space, 0.0)  # change this?
        for action in action_space:
            for prob, next_state, reward in env.OSD(state, action):
                action_values[action] += prob * (reward + discount_factor * V[next_state])
        return action_values

    state_space = env.get_state_space()
    V = dict.fromkeys(action_space, 0.0)  # line 40 in the traceback: the NameError is raised here
    while True:
        # Stopping condition
        delta = 0
        # Update each state...
        for state in state_space:
            # Do a one-step lookahead to find the best action
            action_values = one_step_lookahead(state, V)
            best_action_value = np.max(list(action_values.values()))
            # Calculate delta across all states seen so far
            delta = max(delta, np.abs(best_action_value - V[state]))
            # Update the value function. Ref: Sutton book eq. 4.10.
            V[state] = best_action_value
        # Check if we can stop
        if delta < theta:
            break

    # Create a deterministic policy using the optimal value function
    policy = dict.fromkeys(env.get_state_space())
    for state in policy.keys():
        # One step lookahead to find the best action for this state
        action_values = one_step_lookahead(state, V)
        if multipleOptimalActions:
            best_action_value = np.max(list(action_values.values()))
            best_action = list()
            for action, value in action_values.items():
                if np.abs(best_action_value - value) <= optimalPolicyPrecision:
                    best_action.append(action)
        else:
            index = np.argmax(list(action_values.values()))
            best_action = [list(action_values.keys())[index]]
        # Always take the best action
        policy[state] = best_action
    return policy, V
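For context, one_step_lookahead() is meant to compute the standard Bellman one-step lookahead: for every action, the sum of prob * (reward + discount_factor * V[next_state]) over the possible transitions. Here is a minimal, self-contained sketch of that computation on a made-up two-state model (the names and numbers are purely illustrative and not taken from my environment):

# toy_lookahead.py: illustration only, independent of the code above
gamma = 0.8
# hypothetical model: state -> action -> [(prob, next_state, reward)]
transitions = {
    "s0": {"stay": [(1.0, "s0", 0.0)], "go": [(0.8, "s1", 1.0), (0.2, "s0", 0.0)]},
    "s1": {"stay": [(1.0, "s1", 0.0)]},
}
V = {"s0": 0.0, "s1": 10.0}

def lookahead(state):
    # expected value of each action under the current estimate V
    return {
        action: sum(prob * (reward + gamma * V[nxt]) for prob, nxt, reward in outcomes)
        for action, outcomes in transitions[state].items()
    }

print(lookahead("s0"))  # prints approximately {'stay': 0.0, 'go': 7.2}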
The value iteration algorithm relies on the superclass Environment (Environment.py). A small toy subclass follows the base class to illustrate which methods are meant to be overridden, and the Grid class I actually use inherits from Environment as well:
class Environment:
    def __init__(self, state_space, action_space=None):
        self.state_space = state_space
        if action_space:
            self.action_space = action_space

    def OSD(self, state, action):
        if state not in self.get_state_space():
            raise ValueError("state is invalid", state)
        if action not in self.get_action_space(state):
            raise ValueError("action is invalid", action)
        osd = []
        for prob, next_state in self.pss_a(state, action):
            reward = self.rss_a(state, next_state, action)
            osd.append((prob, next_state, reward))
        return osd

    def get_state_space(self):
        return self.state_space

    def get_action_space(self, state):
        if state not in self.get_state_space():
            raise ValueError("state is invalid", state)
        return self.action_space

    def rss_a(self, state, next_state, action):
        if state not in self.get_state_space():
            raise ValueError("state is invalid", state)
        if next_state not in self.get_state_space():
            raise ValueError("next_state is invalid", next_state)
        if action not in self.get_action_space(state):
            raise ValueError("action is invalid", action)
        return 0

    def pss_a(self, current_state, action):
        if current_state not in self.get_state_space():
            raise ValueError("state is invalid", current_state)
        if action not in self.get_action_space(current_state):
            raise ValueError("action is invalid", action)
        prob, next_state = 0.0, current_state  # placeholder transition; subclasses override this
        return [(prob, next_state)]
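To show the intended inheritance pattern in isolation, here is a minimal throwaway subclass (TwoStateChain is just an illustrative name, not part of my project): a concrete environment only overrides pss_a() and rss_a(), everything else is inherited from Environment.

class TwoStateChain(Environment):
    def __init__(self):
        super().__init__(state_space=[0, 1], action_space=["step"])

    def pss_a(self, current_state, action):
        # deterministic chain: 0 -> 1, and 1 stays in 1
        return [(1.0, min(current_state + 1, 1))]

    def rss_a(self, state, next_state, action):
        # reward 1 only for the transition 0 -> 1
        return 1 if (state == 0 and next_state == 1) else 0

# TwoStateChain().OSD(0, "step") evaluates to [(1.0, 1, 1)]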
#from value_iteration import value_iteration
#from Environment import Environment
STATES = range(16)
FORWARD = "forward"
RIGHT = "right"
LEFT = "left"
ACTIONS = [FORWARD, RIGHT, LEFT]
terminal_states = [15]
def get_next_state(state, action):
    if state not in STATES:
        raise ValueError("State is not known")
    if action not in ACTIONS:
        raise ValueError("Action is not known")
    next_state = state
    if state in terminal_states:
        return state
    if action == RIGHT:
        next_state = state + 1
        if next_state % 4 == 0:
            next_state = next_state - 4
    elif action == LEFT:
        next_state = state - 1
        if (next_state + 1) % 4 == 0:
            next_state = next_state + 4
    elif action == FORWARD:
        if state % 2 == 0 or state in [3, 13]:
            next_state = state
        elif (state - 1) % 4 == 0:
            next_state = state + 4
        elif (state + 1) % 4 == 0:
            next_state = state - 4
    else:
        raise Exception("some error")
    return next_state
class Grid(Environment):
    def __init__(self):
        super().__init__(state_space=STATES, action_space=ACTIONS)
        '''self.state_space = state_space
        if action_space:
            self.action_space = action_space'''

    def rss_a(self, state, next_state, action):
        if state not in self.get_state_space():
            raise ValueError("state is invalid", state)
        if next_state not in self.get_state_space():
            raise ValueError("next_state is invalid", next_state)
        if action not in self.get_action_space(state):
            raise ValueError("action is invalid", action)
        if state != next_state and next_state in terminal_states:
            return 1
        elif state == next_state and next_state not in terminal_states:
            return -1
        else:
            return 1

    def pss_a(self, current_state, action):
        if current_state not in self.get_state_space():
            raise ValueError("state is invalid", current_state)
        if action not in self.get_action_space(current_state):
            raise ValueError("action is invalid", action)
        next_state = get_next_state(current_state, action)
        if next_state in STATES:
            prob = 1
        else:
            prob = 0
        return [(prob, next_state)]
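For reference, this is what the definitions above give for one concrete transition (worked out by hand from get_next_state(), pss_a() and rss_a(), so please treat it as a sanity check rather than verified program output):

g = Grid()
# moving right from state 14 enters the terminal state 15 with probability 1 and reward 1
print(g.OSD(14, RIGHT))   # expected: [(1, 15, 1)]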
if __name__== "__main__":
env = Grid()
policy, V = value_iteration(env, theta=1e-8, discount_factor=0.8, multipleOptimalActions=True)
policy, V = value_iteration(env, theta = 1e-8, discount_factor = 0.8)
print(V)
print(policy)