Миграция проекта со stable_baselines на ray[rllib]
Я главный разработчик Jiminy, быстрого симулятора Python/C++ для многоартикулированных систем, совместимого со средой обучения openAI Gym. До сих пор я использовал stable_baselines для обучения с подкреплением, но хотел бы переключиться на ray[rllib], поскольку он кажется более эффективным и очень хорошо документирован.
Я пытаюсь перенести пример обучения Джимини с тележкой, предоставленный сgym-jiminy
в rllib. Я хочу использовать один и тот же алгоритм простоты (PPO) и модель (FFN) и те же параметры, когда это возможно. Однако он больше не сходится с использованием rllib, и я не понимаю, почему. Я пытался изменить различные параметры, но безуспешно...
Вот основной скрипт, который я использую:
import time
import gym
import ray
from ray import rllib
from ray.rllib.models import MODEL_DEFAULTS
from ray.rllib.agents.trainer import COMMON_CONFIG
from tensorboard.program import TensorBoard
GYM_ENV_NAME = "gym_jiminy:jiminy-cartpole-v0"
# ================= Initialize the Ray backend =================
ray.init(
address=None, # The address of the Ray cluster to connect to, if any.
num_cpus=8, # Number of CPUs assigned to each raylet (None = no limit)
num_gpus=1, # Number of GPUs assigned to each raylet (None = no limit)
webui_host="0.0.0.0", # The host to bind the web UI server to.
local_mode=False, # If true, the code will be executed serially (for debugging purpose)
logging_level=20 # Logging level.
)
if not 'tb' in locals():
tb = TensorBoard()
tb.configure(host="0.0.0.0",
logdir=os.path.join(pathlib.Path.home(), 'ray_results'))
url = tb.launch()
print(f"Starting Tensorboard {url} ...")
# ================= Configure the model =================
# Copy the default model configuration
mdl_cfg = MODEL_DEFAULTS.copy()
# Convolution network settings
mdl_cfg["conv_filters"] = None # Filter config. List of [out_channels, kernel, stride] for each filter
mdl_cfg["conv_activation"] = "relu" # Nonlinearity for built-in convnet
# Fully-connected network settings
mdl_cfg["fcnet_activation"] = "tanh" # Nonlinearity for built-in fully connected net (tanh, relu)
mdl_cfg["fcnet_hiddens"] = [64, 64] # Number of hidden layers for fully connected net
mdl_cfg["no_final_linear"] = False # Whether to skip the final linear layer used to resize the outputs to `num_outputs`
mdl_cfg["free_log_std"] = True # The last half of the output layer does not dependent on the input
mdl_cfg["vf_share_layers"] = True # Whether layers should be shared for the value function.
# LTSM network settings
mdl_cfg["use_lstm"] = False # Whether to wrap the model with a LSTM
mdl_cfg["max_seq_len"] = 20 # Max seq len for training the LSTM
mdl_cfg["lstm_cell_size"] = 256 # Size of the LSTM cell
mdl_cfg["lstm_use_prev_action_reward"] = False # Whether to feed a_{t-1}, r_{t-1} to LSTM
# Custom model settings
mdl_cfg["custom_model"] = None # Name of a custom model to use
mdl_cfg["custom_options"] = {} # Dict of extra options to pass to the custom models
# ================= Configure rllib =================
# Copy the default rllib configuration
rllib_cfg = COMMON_CONFIG.copy()
# Ressources settings
rllib_cfg["use_pytorch"] = True # Use PyTorch instead of Tensorflow
rllib_cfg["num_gpus"] = 1 # Number of GPUs to reserve for the trainer process
rllib_cfg["num_workers"] = 8 # Number of rollout worker actors for parallel sampling
rllib_cfg["num_envs_per_worker"] = 16 # Number of environments per worker
rllib_cfg["num_cpus_per_worker"] = 1 # Number of CPUs to reserve per worker
rllib_cfg["num_cpus_for_driver"] = 0 # Number of CPUs to allocate for the trainer
# Rollout settings
rllib_cfg["rollout_fragment_length"] = 32 # Sample batches of this size (mult. by `num_envs_per_worker`) are collected from rollout workers
rllib_cfg["train_batch_size"] = 512 # Sample batches are concatenated together into batches of this size
rllib_cfg["batch_mode"] = "complete_episodes" # Whether to rollout "complete_episodes" or "truncate_episodes" to `rollout_fragment_length`
rllib_cfg["sample_async"] = False # Use a background thread for sampling (slightly off-policy)
rllib_cfg["observation_filter"] = "NoFilter" # Element-wise observation filter ["NoFilter", "MeanStdFilter"]
rllib_cfg["metrics_smoothing_episodes"] = 100 # Smooth metrics over this many episodes
rllib_cfg["seed"] = None # sets the random seed of each worker (in conjunction with worker_index)
# Environment settings
rllib_cfg["horizon"] = None # Number of steps after which the episode is forced to terminate
rllib_cfg["soft_horizon"] = True # Calculate rewards but don't reset the environment when the horizon is hit
rllib_cfg["no_done_at_end"] = True # Don't set 'done' at the end of the episode
rllib_cfg["env_config"] = {} # Arguments to pass to the env creator
rllib_cfg["normalize_actions"] = False # Normalize actions to the upper and lower bounds of the action space
rllib_cfg["clip_actions"] = False # Whether to clip actions to the upper and lower bounds of the action space
# Learning settings
rllib_cfg["gamma"] = 0.99 # Discount factor of the MDP
rllib_cfg["lr"] = 1.0e-3 # Learning rate
rllib_cfg["shuffle_buffer_size"] = 0 # Shuffle input batches via a sliding window buffer of this size (0 = disable)
rllib_cfg["log_level"] = "WARN" # Set the ray.rllib.* log level for the agent process and its workers [DEBUG, INFO, WARN, or ERROR]
rllib_cfg["model"] = mdl_cfg # Policy model configuration
# ================= Configure the learning algorithm =================
# Select PPO algorithm
from ray.rllib.agents.ppo import PPOTrainer as Trainer, DEFAULT_CONFIG
# Copy the default learning algorithm configuration, including PPO-specific parameters,
# then overwrite the common parameters that has been updated ONLY.
agent_cfg = DEFAULT_CONFIG.copy()
for key, value in rllib_cfg.items():
if COMMON_CONFIG[key] != value:
agent_cfg[key] = value
# Optimizer settings
agent_cfg["sgd_minibatch_size"] = 128 # Total SGD batch size across all devices for SGD. This defines the minibatch size of each SGD epoch
agent_cfg["num_sgd_iter"] = 4 # Number of SGD epochs to execute per train batch
agent_cfg["shuffle_sequences"] = True # Whether to shuffle sequences in the batch when training
# Estimators settings
agent_cfg["use_gae"] = True # Use the Generalized Advantage Estimator (GAE) with a value function (https://arxiv.org/pdf/1506.02438.pdf)
agent_cfg["use_critic"] = False # Use a critic as a value baseline (otherwise don't use any; required for using GAE).
agent_cfg["lambda"] = 0.95 # The GAE(lambda) parameter.
# Learning and optimization settings
agent_cfg["lr_schedule"] = None # Learning rate schedule
agent_cfg["kl_coeff"] = 0.2 # Initial coefficient for KL divergence
agent_cfg["kl_target"] = 0.01 # Target value for KL divergence
agent_cfg["vf_share_layers"] = False # Share layers for value function. If you set this to True, it's important to tune vf_loss_coeff
agent_cfg["vf_loss_coeff"] = 0.5 # Coefficient of the value function loss
agent_cfg["entropy_coeff"] = 0.01 # Coefficient of the entropy regularizer
agent_cfg["entropy_coeff_schedule"] = None # Decay schedule for the entropy regularizer
agent_cfg["clip_param"] = 0.2 # PPO clip parameter
agent_cfg["vf_clip_param"] = float("inf") # Clip param for the value function. Note that this is sensitive to the scale of the rewards (-1 to disable)
agent_cfg["grad_clip"] = None # Clip the global norm of gradients by this amount (None = disable) (No working with PyTorch ML backend)
# ================= Configure the learning algorithm =================
train_agent = Trainer(agent_cfg, GYM_ENV_NAME)
# ================= Run the optimization =================
timesteps_total = 500000
results_fields_filter = ["training_iteration", "time_total_s", "timesteps_total", "episode_reward_max", "episode_reward_mean",
["info", ["sample_time_ms", "grad_time_ms", "opt_peak_throughput", "sample_peak_throughput"]]]
result = {"timesteps_total": 0}
while result["timesteps_total"] < timesteps_total:
# Perform one iteration of training the policy
result = train_agent.train()
# Print the training status
for field in results_fields_filter:
if not isinstance(field, list):
if field in result.keys():
print(f"{field}: {result[field]}")
else:
for subfield in field[1]:
if subfield in result[field[0]].keys():
print(f"{subfield} : {result[field[0]][subfield]}")
print("============================")
checkpoint_path = train_agent.save()
# ================= Enjoy a trained agent =================
t_end = 10.0 # Total duration of the simulation(s) in seconds
env = gym.make(GYM_ENV_NAME)
test_agent = Trainer(agent_cfg, GYM_ENV_NAME)
test_agent.restore(checkpoint_path)
t_init = time.time()
t_prev = t_init
while t_prev - t_init < 20.0:
observ = env.reset()
done = False
cumulative_reward = 0
while (not done) and (t_prev - t_init < 20.0):
action = test_agent.compute_action(observ, explore=False)
observ, reward, done, _ = env.step(action)
cumulative_reward += reward
env.render()
sleep(env.dt - (time.time() - t_prev))
t_prev = time.time()
print(cumulative_reward)
# ================= Terminate the Ray backend =================
ray.shutdown()
Что-то явно не так в том, что я делаю? Кто-нибудь знаком с обоими модулями, чтобы иметь представление о различиях между ними?
PS: можно запустить этот скрипт после установки последней версии Ray Wheel иgym-jiminy
с помощью пункта.