Актер-критик модель никогда не сходится
Я пытаюсь реализовать Actor-Critic, используя Keras & Tensorflow. Однако, это никогда не сходится, и я не могу понять, почему. Я уменьшил скорость обучения, но она не изменилась.
Код находится в python3.5.1 и tenorflow1.2.1
import gym
import itertools
import matplotlib
import numpy as np
import sys
import tensorflow as tf
import collections
from keras.models import Model
from keras.layers import Input, Dense
from keras.utils import to_categorical
from keras import backend as K
env = gym.make('CartPole-v0')
NUM_STATE = env.env.observation_space.shape[0]
NUM_ACTIONS = env.env.action_space.n
LEARNING_RATE = 0.0005
TARGET_AVG_REWARD = 195
class Actor_Critic():
def __init__(self):
l_input = Input(shape=(NUM_STATE, ))
l_dense = Dense(16, activation='relu')(l_input)
## Policy Network
action_probs = Dense(NUM_ACTIONS, activation='softmax')(l_dense)
policy_network = Model(input=l_input, output=action_probs)
## Value Network
state_value = Dense(1, activation='linear')(l_dense)
value_network = Model(input=l_input, output=state_value)
graph = self._build_graph(policy_network, value_network)
self.state, self.action, self.target, self.action_probs, self.state_value, self.minimize, self.loss = graph
def _build_graph(self, policy_network, value_network):
state = tf.placeholder(tf.float32)
action = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS))
target = tf.placeholder(tf.float32, shape=(None))
action_probs = policy_network(state)
state_value = value_network(state)[0]
advantage = tf.stop_gradient(target) - state_value
log_prob = tf.log(tf.reduce_sum(action_probs * action, reduction_indices=1))
p_loss = -log_prob * advantage
v_loss = tf.reduce_mean(tf.square(advantage))
loss = p_loss + (0.5 * v_loss)
# optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, decay=.99)
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
minimize = optimizer.minimize(loss)
return state, action, target, action_probs, state_value, minimize, loss,
def predict_policy(self, sess, state):
return sess.run(self.action_probs, { self.state: [state] })
def predict_value(self, sess, state):
return sess.run(self.state_value, { self.state: [state] })
def update(self, sess, state, action, target):
feed_dict = {self.state:[state], self.target:target, self.action:to_categorical(action, NUM_ACTIONS)}
_, loss = sess.run([self.minimize, self.loss], feed_dict)
return loss
def train(env, sess, estimator, num_episodes, discount_factor=1.0):
Transition = collections.namedtuple("Transition", ["state", "action", "reward", "loss"])
last_100 = np.zeros(100)
for i_episode in range(num_episodes):
# Reset the environment and pick the fisrst action
state = env.reset()
episode = []
# One step in the environment
for t in itertools.count():
# Take a step
action_probs = estimator.predict_policy(sess, state)[0]
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
next_state, reward, done, _ = env.step(action)
target = reward + (0 if done else discount_factor * estimator.predict_value(sess, next_state))
# Update our policy estimator
loss = estimator.update(sess, state, action, target)
# Keep track of the transition
episode.append(Transition(state=state, action=action, reward=reward, loss=loss))
if done:
break
state = next_state
total_reward = sum(e.reward for e in episode)
last_100[i_episode % 100] = total_reward
last_100_avg = sum(last_100) / 100
total_loss = sum(e.loss for e in episode)
print('episode %s loss: %f reward: %f last 100: %f' % (i_episode, total_loss, total_reward, last_100_avg))
if last_100_avg >= TARGET_AVG_REWARD:
break
return
estimator = Actor_Critic()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
stats = train(env, sess, estimator, 2000, discount_factor=0.99)
Вот журнал в начале:(последние 100 - это средняя награда за последние 100 эпизодов. Она автоматически увеличивается в первых 100 эпизодах, поэтому игнорируйте ее.)
episode 0 loss: 17.662344 reward: 15.000000 last 100: 0.150000
episode 1 loss: 15.319713 reward: 13.000000 last 100: 0.280000
episode 2 loss: 38.097054 reward: 32.000000 last 100: 0.600000
episode 3 loss: 22.229492 reward: 19.000000 last 100: 0.790000
episode 4 loss: 31.027534 reward: 26.000000 last 100: 1.050000
episode 5 loss: 21.037663 reward: 18.000000 last 100: 1.230000
episode 6 loss: 18.750641 reward: 16.000000 last 100: 1.390000
episode 7 loss: 23.268227 reward: 20.000000 last 100: 1.590000
episode 8 loss: 27.251028 reward: 23.000000 last 100: 1.820000
episode 9 loss: 20.008078 reward: 17.000000 last 100: 1.990000
episode 10 loss: 28.213932 reward: 24.000000 last 100: 2.230000
episode 11 loss: 28.109922 reward: 23.000000 last 100: 2.460000
episode 12 loss: 25.068121 reward: 21.000000 last 100: 2.670000
episode 13 loss: 59.581238 reward: 50.000000 last 100: 3.170000
episode 14 loss: 26.618759 reward: 22.000000 last 100: 3.390000
episode 15 loss: 28.847467 reward: 24.000000 last 100: 3.630000
episode 16 loss: 22.534216 reward: 17.000000 last 100: 3.800000
episode 17 loss: 19.760979 reward: 15.000000 last 100: 3.950000
episode 18 loss: 31.018209 reward: 25.000000 last 100: 4.200000
episode 19 loss: 22.938683 reward: 16.000000 last 100: 4.360000
episode 20 loss: 30.372072 reward: 24.000000 last 100: 4.600000
После 500 эпизодов он не только не улучшается, но и на самом деле хуже, чем в начале.
episode 501 loss: 97.043335 reward: 8.000000 last 100: 13.500000
episode 502 loss: 101.957603 reward: 11.000000 last 100: 13.510000
episode 503 loss: 100.277809 reward: 11.000000 last 100: 13.520000
episode 504 loss: 96.754257 reward: 9.000000 last 100: 13.510000
episode 505 loss: 99.436943 reward: 11.000000 last 100: 13.530000
episode 506 loss: 105.161621 reward: 16.000000 last 100: 13.580000
episode 507 loss: 65.993591 reward: 12.000000 last 100: 13.610000
episode 508 loss: 59.837429 reward: 9.000000 last 100: 13.600000
episode 509 loss: 92.478806 reward: 9.000000 last 100: 13.570000
episode 510 loss: 96.697289 reward: 14.000000 last 100: 13.620000
episode 511 loss: 94.611366 reward: 10.000000 last 100: 13.620000
episode 512 loss: 100.259460 reward: 15.000000 last 100: 13.680000
episode 513 loss: 88.776451 reward: 10.000000 last 100: 13.690000
episode 514 loss: 86.659203 reward: 9.000000 last 100: 13.700000
episode 515 loss: 105.494476 reward: 17.000000 last 100: 13.770000
episode 516 loss: 90.662186 reward: 12.000000 last 100: 13.770000
episode 517 loss: 90.777634 reward: 12.000000 last 100: 13.810000
episode 518 loss: 91.290558 reward: 14.000000 last 100: 13.860000
episode 519 loss: 94.902023 reward: 11.000000 last 100: 13.870000
episode 520 loss: 86.746582 reward: 12.000000 last 100: 13.900000
С другой стороны, простой градиент политики действительно сходится.
import gym
import itertools
import matplotlib
import numpy as np
import sys
import tensorflow as tf
import collections
from keras.models import Model
from keras.layers import Input, Dense
from keras.utils import to_categorical
from keras import backend as K
env = gym.make('CartPole-v0')
NUM_STATE = env.env.observation_space.shape[0]
NUM_ACTIONS = env.env.action_space.n
LEARNING_RATE = 0.0005
TARGET_AVG_REWARD = 195
class PolicyEstimator():
"""
Policy Function approximator.
"""
def __init__(self):
l_input = Input(shape=(NUM_STATE, ))
l_dense = Dense(16, activation='relu')(l_input)
action_probs = Dense(NUM_ACTIONS, activation='softmax')(l_dense)
model = Model(inputs=[l_input], outputs=[action_probs])
self.state, self.action, self.target, self.action_probs, self.minimize, self.loss = self._build_graph(model)
def _build_graph(self, model):
state = tf.placeholder(tf.float32)
action = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS))
target = tf.placeholder(tf.float32, shape=(None))
action_probs = model(state)
log_prob = tf.log(tf.reduce_sum(action_probs * action, reduction_indices=1))
loss = -log_prob * target
# optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, decay=.99)
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
minimize = optimizer.minimize(loss)
return state, action, target, action_probs, minimize, loss
def predict(self, sess, state):
return sess.run(self.action_probs, { self.state: [state] })
def update(self, sess, state, action, target):
feed_dict = {self.state:[state], self.target:[target], self.action:to_categorical(action, NUM_ACTIONS)}
_, loss = sess.run([self.minimize, self.loss], feed_dict)
return loss
def train(env, sess, estimator_policy, num_episodes, discount_factor=1.0):
Transition = collections.namedtuple("Transition", ["state", "action", "reward"])
last_100 = np.zeros(100)
for i_episode in range(num_episodes):
# Reset the environment and pick the fisrst action
state = env.reset()
episode = []
# One step in the environment
for t in itertools.count():
# Take a step
action_probs = estimator_policy.predict(sess, state)[0]
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
next_state, reward, done, _ = env.step(action)
# Keep track of the transition
episode.append(Transition(state=state, action=action, reward=reward))
if done:
break
state = next_state
# Go through the episode and make policy updates
for t, transition in enumerate(episode):
# The return after this timestep
target = sum(discount_factor**i * t2.reward for i, t2 in enumerate(episode[t:]))
# Update our policy estimator
loss = estimator_policy.update(sess, transition.state, transition.action, target)
total_reward = sum(e.reward for e in episode)
last_100[i_episode % 100] = total_reward
last_100_avg = sum(last_100) / 100
print('episode %s reward: %f last 100: %f' % (i_episode, total_reward, last_100_avg))
if last_100_avg >= TARGET_AVG_REWARD:
break
return
policy_estimator = PolicyEstimator()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
stats = train(env, sess, policy_estimator, 2000, discount_factor=1.0)
Код ссылки
https://github.com/jaara/AI-blog/blob/master/CartPole-A3C.py
https://github.com/coreylynch/async-rl
Любая помощь приветствуется.
[Обновить]
Я изменил код в _build_graph
от
advantage = tf.stop_gradient(target) - state_value
log_prob = tf.log(tf.reduce_sum(action_probs * action, reduction_indices=1))
p_loss = -log_prob * advantage
v_loss = tf.reduce_mean(tf.square(advantage))
loss = p_loss + (0.5 * v_loss)
в
advantage = target - state_value
log_prob = tf.log(tf.reduce_sum(action_probs * action, reduction_indices=1))
p_loss = -log_prob * tf.stop_gradient(advantage)
v_loss = 0.5 * tf.reduce_mean(tf.square(advantage))
loss = p_loss + v_loss
Стало лучше и много получало 200 наград (максимум). Однако после 4000 эпизодов он все еще не достиг среднего уровня 195.
2 ответа
Первая очевидная вещь - неправильный градиент, остановленный в преимуществе:
advantage = tf.stop_gradient(target) - state_value
должно быть
advantage = target - tf.stop_gradient(state_value)
Поскольку для цели в любом случае нет градиента (это константа), и вам нужно добиться отсутствия градиента, проходящего через сеть значений (базовую линию) для градиента политики. У вас есть отдельная потеря для базовой линии (которая выглядит хорошо).
Другая возможная ошибка - это способ уменьшить потери. Вы явно вызываете Reduce_mean для v_loss, но никогда для p_loss. Следовательно, масштабирование отключено, и ваша стоимостная сеть, вероятно, учится намного медленнее (поскольку вы усредняете по первому, вероятно, временному измерению).
Пара предложений:
- отрицательная награда. Вам нужно как-то "оштрафовать" состояния терминала. (например, вы можете жестко наградить вознаграждение с
if done: reward = -10
.) иначе критик никогда не оценит отрицательные значения для терминальных состояний. без отрицательных ценностей плохие поступки никогда не препятствуют. - минимизация потерь. у тебя должно быть два
minimize
операции: одна для актера, другая для критика. в идеале критик должен иметь более высокую скорость обучения, чем у актера.