CartPole Deep Q-Network only gets worse, loss keeps increasing
I tried to solve the CartPole environment from OpenAI Gym, but my model only seems to get worse. It is a double (deep) Q-network that I tried to put together from several resources, but I am not sure where my mistake is. I have tried changing the number of episodes, the optimizer, memory_size, and batch_size, but nothing has helped. (Plots of the loss and of the episode scores are attached.)
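For reference, the update I believe the training step should be computing (as I understood it from the tutorials I followed) is the standard one-step DQN target with a separate target network. The small sketch below, with a hypothetical helper dqn_target and made-up numbers, is only there to illustrate that intention; it is not part of my actual code:

import torch

def dqn_target(rewards, next_q_max, dones, gamma=0.99):
    # One-step DQN target: r + gamma * max_a' Q_target(s', a'),
    # with the bootstrap term zeroed out on terminal transitions.
    next_q_max = next_q_max.clone()
    next_q_max[dones] = 0.0
    return rewards + gamma * next_q_max

# Toy check: the non-terminal transition gives 1 + 0.99 * 5 = 5.95,
# the terminal one gives just the reward.
print(dqn_target(torch.tensor([1.0, 1.0]),
                 torch.tensor([5.0, 7.0]),
                 torch.tensor([False, True])))  # tensor([5.9500, 1.0000])

In the full code below the loss is then the MSE between this target and the online network's prediction, and only the online network is optimised.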
import gym
import numpy as np
import math
import random
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
class ReplayMemory():
    def __init__(self, capacity):
        self.state_memory = []
        self.action_memory = np.array([], dtype=np.int64)
        self.reward_memory = np.array([], dtype=np.float32)
        self.new_state_memory = []
        self.done_memory = np.array([], dtype=np.bool)
        self.capacity = capacity
        self.store_count = 0

    def store(self, experience):
        if len(self.state_memory) == self.capacity:
            # buffer is full: overwrite the slot at idx
            idx = self.store_count % self.capacity
            self.state_memory[idx] = experience[0]
            self.action_memory[idx] = experience[1]
            self.reward_memory[idx] = experience[2]
            self.new_state_memory[idx] = experience[3]
            self.done_memory[idx] = experience[4]
            return
        self.state_memory.append(experience[0])
        self.action_memory = np.append(self.action_memory, experience[1])
        self.reward_memory = np.append(self.reward_memory, experience[2])
        self.new_state_memory.append(experience[3])
        self.done_memory = np.append(self.done_memory, experience[4])

    def sample(self, batch_size):
        # draw a random minibatch of stored transitions without replacement
        indeces = np.random.choice(len(self.state_memory), size=batch_size, replace=False)
        states = torch.tensor(np.array(self.state_memory, dtype=np.float32)[indeces])
        actions = torch.tensor(self.action_memory[indeces])
        rewards = torch.tensor(self.reward_memory[indeces])
        new_states = torch.tensor(np.array(self.new_state_memory, dtype=np.float32)[indeces])
        dones = torch.tensor(self.done_memory[indeces])
        return states, actions, rewards, new_states, dones

    def can_provide_sample(self, batch_size):
        return len(self.state_memory) >= batch_size


class Agent():
    def __init__(self):
        self.eps = 1
        self.eps_end = 0.01
        self.eps_start = 1
        self.eps_dec = 0.9999

    def choose_action(self, state, online_network):
        # epsilon-greedy: exploit with probability 1 - eps, otherwise act randomly
        if random.random() > self.eps:
            tensor_state = torch.tensor([state], dtype=torch.float)
            return online_network(tensor_state).argmax().item()
        return random.randint(0, 1)

    def decrease_eps(self, step, max_steps):
        self.eps = max(self.eps_end, self.eps_dec * self.eps)
memory = ReplayMemory(10_000)
online_network = nn.Sequential(
    nn.Linear(4, 12),
    nn.ReLU(),
    nn.Linear(12, 8),
    nn.ReLU(),
    nn.Linear(8, 2))
target_network = nn.Sequential(
    nn.Linear(4, 12),
    nn.ReLU(),
    nn.Linear(12, 8),
    nn.ReLU(),
    nn.Linear(8, 2))
online_network.float()
target_network.float()
target_network.load_state_dict(online_network.state_dict())
target_network.eval() # not going to be trained
optimizer = optim.Adam(params=online_network.parameters(), lr=0.001)
agent = Agent()
batch_size = 128
gamma = 0.99
update_target_every = 1000
env = gym.make('CartPole-v0')
def plot():
    plt.plot(scores)
    plt.ylabel('Score')
    plt.show()
    plt.plot(losses)
    plt.ylabel('Loss')
    plt.show()
num_episodes = 2000
scores, losses, epses = [], [], []
step_count = 0
loss = torch.tensor(-100) # just a random number so I can print it even in the beginning
for ep in range(num_episodes):
    state = env.reset()
    done = False
    score = 0.0
    while not done:
        action = agent.choose_action(state, online_network)
        epses.append(agent.eps)  # logging
        agent.decrease_eps(ep, num_episodes)
        next_state, reward, done, info = env.step(action)
        memory.store((state, action, np.float32(reward), next_state, done))
        state = next_state
        score += reward
        if memory.can_provide_sample(batch_size):
            # periodically sync the target network with the online network
            if step_count % update_target_every == 0:
                target_network.load_state_dict(online_network.state_dict())
            states, actions, rewards, next_states, dones = memory.sample(batch_size)
            q_pred = online_network(states).max(dim=1)[0]
            q_next = target_network(next_states).max(dim=1)[0]
            q_next[dones] = 0.0
            q_target = rewards + gamma * q_next
            # learning
            loss = F.mse_loss(q_target, q_pred)
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        step_count += 1
    # logging
    scores.append(score)
    avg_score = np.mean(scores[-100:])
    print(f'episode {ep}: score: {score}, avg score: {avg_score}, '
          f'eps: {agent.eps}, loss: {loss.item()}')
plot()
env.close()