CartPole Deep Q network only gets worse, losses keep growing

I tried to solve the CartPole environment from the OpenAI gym, but my model only seems to get worse. It is a double (deep) Q-network that I tried to put together from several resources, but I am not sure where my mistake is. I have tried changing the number of episodes, the optimizer, memory_size, and batch_size, but nothing has helped.

[Loss plot] [Score plot]

import gym
import numpy as np
import math
import random
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

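# Replay memory kept as parallel lists/arrays of states, actions, rewards, next states and done flags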
class ReplayMemory():
  def __init__(self, capacity):
    self.state_memory = []
    self.action_memory = np.array([], dtype=np.int64)
    self.reward_memory = np.array([], dtype=np.float32)
    self.new_state_memory = []
    self.done_memory = np.array([], dtype=bool)
    self.capacity = capacity
    self.store_count = 0

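  # Store one transition; once the buffer is full, overwrite the slot at store_count % capacity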
  def store(self, experience):
    if len(self.state_memory) == self.capacity:
      idx = self.store_count % self.capacity
      self.state_memory[idx] = experience[0]
      self.action_memory[idx] = experience[1]
      self.reward_memory[idx] = experience[2]
      self.new_state_memory[idx] = experience[3]
      self.done_memory[idx] = experience[4]
      return
    
    self.state_memory.append(experience[0])
    self.action_memory = np.append(self.action_memory, experience[1])
    self.reward_memory = np.append(self.reward_memory, experience[2])
    self.new_state_memory.append(experience[3])
    self.done_memory = np.append(self.done_memory, experience[4])
      
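  # Draw a random minibatch (without replacement) and return it as torch tensors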
  def sample(self, batch_size):
    indices = np.random.choice(len(self.state_memory), size=batch_size, replace=False)
    states = torch.tensor(np.array(self.state_memory, dtype=np.float32)[indices])
    actions = torch.tensor(self.action_memory[indices])
    rewards = torch.tensor(self.reward_memory[indices])
    new_states = torch.tensor(np.array(self.new_state_memory, dtype=np.float32)[indices])
    dones = torch.tensor(self.done_memory[indices])
    return states, actions, rewards, new_states, dones

  def can_provide_sample(self, batch_size):
    return len(self.state_memory) >= batch_size

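# Epsilon-greedy action selection with multiplicative epsilon decay down to eps_end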
class Agent():
  def __init__(self):
    self.eps = 1
    self.eps_end = 0.01
    self.eps_start = 1
    self.eps_dec = 0.9999
  
  def choose_action(self, state, online_network):
    if random.random() > self.eps:
      tensor_state = torch.tensor([state], dtype=torch.float)
      return online_network(tensor_state).argmax().item()
    return random.randint(0, 1)
    
  def decrease_eps(self, step, max_steps):
    self.eps = max(self.eps_end, self.eps_dec * self.eps)


memory = ReplayMemory(10_000)

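# Two identical networks: the online network is trained, the target network is a copy that is re-synced periodically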
online_network = nn.Sequential(
  nn.Linear(4, 12),
  nn.ReLU(),
  nn.Linear(12, 8),
  nn.ReLU(),
  nn.Linear(8, 2))

target_network = nn.Sequential(
  nn.Linear(4, 12),
  nn.ReLU(),
  nn.Linear(12, 8),
  nn.ReLU(),
  nn.Linear(8, 2))

online_network.float()
target_network.float()
target_network.load_state_dict(online_network.state_dict())
target_network.eval() # not going to be trained

optimizer = optim.Adam(params=online_network.parameters(), lr=0.001)

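# Agent and training hyperparameters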
agent = Agent()
batch_size = 128
gamma = 0.99
update_target_every = 1000

env = gym.make('CartPole-v0')

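# Plot the per-episode scores and the per-update losses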
def plot():
  plt.plot(scores)
  plt.ylabel('Score')
  plt.show()
  
  plt.plot(losses)
  plt.ylabel('Loss')
  plt.show()

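# Main training loop: act epsilon-greedily, store each transition, and learn from random minibatches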
num_episodes = 2000
scores, losses, epses = [], [], []
step_count = 0
loss = torch.tensor(-100) # placeholder value so loss.item() can be printed before the first update
for ep in range(num_episodes):
  state = env.reset()
  done = False
  score = 0.0
  while not done:
    action = agent.choose_action(state, online_network)
    epses.append(agent.eps) # logging
    agent.decrease_eps(ep, num_episodes)
    next_state, reward, done, info = env.step(action)
    memory.store((state, action, np.float32(reward), next_state, done))
    state = next_state

    score += reward

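    # Learn only once the buffer can provide a full batch; periodically copy the online weights into the target network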
    if memory.can_provide_sample(batch_size):
      if step_count % update_target_every == 0:
        target_network.load_state_dict(online_network.state_dict())
    
      states, actions, rewards, next_states, dones = memory.sample(batch_size)
      
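      # q_pred: the online network's maximum Q-value for each sampled state;
      # q_next: the target network's maximum Q-value for the next state, zeroed where the episode ended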
      q_pred = online_network(states).max(dim=1)[0]
      q_next = target_network(next_states).max(dim=1)[0]
      q_next[dones] = 0.0
      
      q_target = rewards + gamma * q_next
      
      # learning
      loss = F.mse_loss(q_target, q_pred)
      losses.append(loss.item())
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      
      step_count += 1
    
  # logging
  scores.append(score)
  avg_score = np.mean(scores[-100:])
  
  print(f'episode {ep}: score: {score}, avg score: {avg_score}, '
        f'eps: {agent.eps}, loss: {loss.item()}')

plot()

env.close()
