Trying to implement the value iteration algorithm, but getting an error I cannot resolve
I am currently trying to implement the value iteration algorithm. The goal is for the algorithm to be self-contained so that it can be reused for different problems. That is why I wrote the code in an object-oriented style. When I run it, I get the following error message:
File "/Users", line 81, in <module>
policy, V = value_iteration(env, theta=1e-8, discount_factor=0.8, multipleOptimalActions=True)
File "/Users", line 40, in value_iteration
V = dict.fromkeys(action_space, 0.0)
NameError: name 'action_space' is not defined ```
Traceback (most recent call last):
File "/Users", line 81, in <module>
policy, V = value_iteration(env, theta=1e-8, discount_factor=0.8, multipleOptimalActions=True)
File "/Users", line 40, in value_iteration
V = dict.fromkeys(action_space, 0.0)
NameError: name 'action_space' is not defined
I have really tried everything, but I cannot solve the problem. I have provided the relevant code below so that you can reproduce the error yourselves and, hopefully, find a solution. I am grateful for any help.
The value iteration algorithm is shown below; the implementation uses a helper function one_step_lookahead(). Right after the listing I also show a small standalone example of what this helper is supposed to compute.
import numpy as np
import matplotlib.pyplot as plt

# value_iteration.py
def value_iteration(env, theta, discount_factor, multipleOptimalActions=False, optimalPolicyPrecision=0.0):
    """
    Value Iteration Algorithm.

    Args:
        env: Environment instance. env.OSD(s, a) returns a list of transition
            tuples (prob, next_state, reward) for taking action a in state s;
            env.get_state_space() returns all states and env.get_action_space(s)
            the actions available in state s.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.
        multipleOptimalActions: If True, keep every action whose value is within
            optimalPolicyPrecision of the best value instead of a single best action.
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """
    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value of every action in a given state.
        Args:
            state: The state to consider (int)
            V: The current value function estimate, a dict mapping states to values
        Returns:
            A dict mapping each available action to its expected value.
        """
        action_space = env.get_action_space(state)  # change this?
        action_values = dict.fromkeys(action_space, 0.0)  # change this?
        for action in action_space:
            for prob, next_state, reward in env.OSD(state, action):
                action_values[action] += prob * (reward + discount_factor * V[next_state])
        return action_values

    state_space = env.get_state_space()
    V = dict.fromkeys(action_space, 0.0)  # line 40 in the traceback: the NameError is raised here
    while True:
        # Stopping condition
        delta = 0
        # Update each state...
        for state in state_space:
            # Do a one-step lookahead to find the best action
            action_values = one_step_lookahead(state, V)
            best_action_value = np.max(list(action_values.values()))
            # Calculate delta across all states seen so far
            delta = max(delta, np.abs(best_action_value - V[state]))
            # Update the value function. Ref: Sutton book eq. 4.10.
            V[state] = best_action_value
        # Check if we can stop
        if delta < theta:
            break

    # Create a deterministic policy using the optimal value function
    policy = dict.fromkeys(env.get_state_space())
    for state in policy.keys():
        # One step lookahead to find the best action for this state
        action_values = one_step_lookahead(state, V)
        if multipleOptimalActions:
            best_action_value = np.max(list(action_values.values()))
            best_action = list()
            for action, value in action_values.items():
                if np.abs(best_action_value - value) <= optimalPolicyPrecision:
                    best_action.append(action)
        else:
            index = np.argmax(list(action_values.values()))
            best_action = [list(action_values.keys())[index]]
        # Always take the best action
        policy[state] = best_action
    return policy, V
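For context, one_step_lookahead() is meant to compute the standard Bellman one-step lookahead: for every action, the sum of prob * (reward + discount_factor * V[next_state]) over the possible transitions. Here is a minimal, self-contained sketch of that computation on a made-up two-state model (the names and numbers are purely illustrative and not taken from my environment):

# toy_lookahead.py: illustration only, independent of the code above
gamma = 0.8
# hypothetical model: state -> action -> [(prob, next_state, reward)]
transitions = {
    "s0": {"stay": [(1.0, "s0", 0.0)], "go": [(0.8, "s1", 1.0), (0.2, "s0", 0.0)]},
    "s1": {"stay": [(1.0, "s1", 0.0)]},
}
V = {"s0": 0.0, "s1": 10.0}

def lookahead(state):
    # expected value of each action under the current estimate V
    return {
        action: sum(prob * (reward + gamma * V[nxt]) for prob, nxt, reward in outcomes)
        for action, outcomes in transitions[state].items()
    }

print(lookahead("s0"))  # prints approximately {'stay': 0.0, 'go': 7.2}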
The value iteration algorithm relies on the superclass Environment (Environment.py). A small toy subclass follows the base class to illustrate which methods are meant to be overridden, and the Grid class I actually use inherits from Environment as well:
class Environment:
    def __init__(self, state_space, action_space=None):
        self.state_space = state_space
        if action_space:
            self.action_space = action_space

    def OSD(self, state, action):
        if state not in self.get_state_space():
            raise ValueError("state is invalid", state)
        if action not in self.get_action_space(state):
            raise ValueError("action is invalid", action)
        osd = []
        for prob, next_state in self.pss_a(state, action):
            reward = self.rss_a(state, next_state, action)
            osd.append((prob, next_state, reward))
        return osd

    def get_state_space(self):
        return self.state_space

    def get_action_space(self, state):
        if state not in self.get_state_space():
            raise ValueError("state is invalid", state)
        return self.action_space

    def rss_a(self, state, next_state, action):
        if state not in self.get_state_space():
            raise ValueError("state is invalid", state)
        if next_state not in self.get_state_space():
            raise ValueError("next_state is invalid", next_state)
        if action not in self.get_action_space(state):
            raise ValueError("action is invalid", action)
        return 0

    def pss_a(self, current_state, action):
        if current_state not in self.get_state_space():
            raise ValueError("state is invalid", current_state)
        if action not in self.get_action_space(current_state):
            raise ValueError("action is invalid", action)
        prob, next_state = 0.0, current_state  # placeholder transition; subclasses override this
        return [(prob, next_state)]
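To show the intended inheritance pattern in isolation, here is a minimal throwaway subclass (TwoStateChain is just an illustrative name, not part of my project): a concrete environment only overrides pss_a() and rss_a(), everything else is inherited from Environment.

class TwoStateChain(Environment):
    def __init__(self):
        super().__init__(state_space=[0, 1], action_space=["step"])

    def pss_a(self, current_state, action):
        # deterministic chain: 0 -> 1, and 1 stays in 1
        return [(1.0, min(current_state + 1, 1))]

    def rss_a(self, state, next_state, action):
        # reward 1 only for the transition 0 -> 1
        return 1 if (state == 0 and next_state == 1) else 0

# TwoStateChain().OSD(0, "step") evaluates to [(1.0, 1, 1)]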
#from value_iteration import value_iteration
#from Environment import Environment
STATES = range(16)
FORWARD = "forward"
RIGHT = "right"
LEFT = "left"
ACTIONS = [FORWARD, RIGHT, LEFT]
terminal_states = [15]
def get_next_state(state, action):
    if state not in STATES:
        raise ValueError("State is not known")
    if action not in ACTIONS:
        raise ValueError("Action is not known")
    next_state = state
    if state in terminal_states:
        return state
    if action == RIGHT:
        next_state = state + 1
        if next_state % 4 == 0:
            next_state = next_state - 4
    elif action == LEFT:
        next_state = state - 1
        if (next_state + 1) % 4 == 0:
            next_state = next_state + 4
    elif action == FORWARD:
        if state % 2 == 0 or state in [3, 13]:
            next_state = state
        elif (state - 1) % 4 == 0:
            next_state = state + 4
        elif (state + 1) % 4 == 0:
            next_state = state - 4
    else:
        raise Exception("some error")
    return next_state
class Grid(Environment):
    def __init__(self):
        super().__init__(state_space=STATES, action_space=ACTIONS)
        '''self.state_space = state_space
        if action_space:
            self.action_space = action_space'''

    def rss_a(self, state, next_state, action):
        if state not in self.get_state_space():
            raise ValueError("state is invalid", state)
        if next_state not in self.get_state_space():
            raise ValueError("next_state is invalid", next_state)
        if action not in self.get_action_space(state):
            raise ValueError("action is invalid", action)
        if state != next_state and next_state in terminal_states:
            return 1
        elif state == next_state and next_state not in terminal_states:
            return -1
        else:
            return 1

    def pss_a(self, current_state, action):
        if current_state not in self.get_state_space():
            raise ValueError("state is invalid", current_state)
        if action not in self.get_action_space(current_state):
            raise ValueError("action is invalid", action)
        next_state = get_next_state(current_state, action)
        if next_state in STATES:
            prob = 1
        else:
            prob = 0
        return [(prob, next_state)]
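For reference, this is what the definitions above give for one concrete transition (worked out by hand from get_next_state(), pss_a() and rss_a(), so please treat it as a sanity check rather than verified program output):

g = Grid()
# moving right from state 14 enters the terminal state 15 with probability 1 and reward 1
print(g.OSD(14, RIGHT))   # expected: [(1, 15, 1)]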
if __name__== "__main__":
env = Grid()
policy, V = value_iteration(env, theta=1e-8, discount_factor=0.8, multipleOptimalActions=True)
policy, V = value_iteration(env, theta = 1e-8, discount_factor = 0.8)
print(V)
print(policy)