Module pearl.utils.scripts.benchmark_offline_rl
#!/usr/bin/env fbpython
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
import os
import pickle
from typing import List, Optional
import torch
from pearl.api.agent import Agent
from pearl.api.environment import Environment
from pearl.neural_networks.sequential_decision_making.actor_networks import (
GaussianActorNetwork,
VanillaContinuousActorNetwork,
)
from pearl.pearl_agent import PearlAgent
from pearl.policy_learners.sequential_decision_making.implicit_q_learning import (
ImplicitQLearning,
)
from pearl.policy_learners.sequential_decision_making.soft_actor_critic_continuous import (
ContinuousSoftActorCritic,
)
from pearl.replay_buffers.sequential_decision_making.fifo_off_policy_replay_buffer import (
FIFOOffPolicyReplayBuffer,
)
from pearl.utils.functional_utils.experimentation.create_offline_data import (
create_offline_data,
get_data_collection_agent_returns,
)
from pearl.utils.functional_utils.train_and_eval.offline_learning_and_evaluation import (
get_offline_data_in_buffer,
offline_evaluation,
offline_learning,
)
from pearl.utils.functional_utils.train_and_eval.online_learning import run_episode
from pearl.utils.instantiations.environments.gym_environment import GymEnvironment
def get_random_agent_returns(
agent: Agent,
env: Environment,
save_path: Optional[str],
file_path: Optional[str] = None,
learn: bool = False,
learn_after_episode: bool = False,
evaluation_episodes: int = 500,
seed: Optional[int] = None,
) -> List[float]:
"""
This function returns a list of episode returns of a Pearl Agent interacting with the input
environment using a randomly instantiated policy learner. This is needed to compute
the baseline for calculating normalized scores for offline rl benchmarks.
Args:
agent: a pearl agent with a randomly initiated policy learner.
env: an environment to collect data from (e.g. GymEnvironment)
learn: should be set to False.
exploit: should be set to True.
learn_after_episode: should be set to False.
"""
# evaluation results of a random agent (with no learning)
print("getting returns of a random agent")
# check for a pickle file with episodic returns saved in the file path
if file_path is not None:
if os.path.isfile(file_path):
print(f"loading returns from file {file_path}")
with open(file_path, "rb") as file:
random_agent_returns = pickle.load(file)
else:
raise FileNotFoundError(f"No file found at {file_path}")
else:
print(
"no returns file path provided; proceeding to collect data from environment directly"
)
random_agent_returns = []
for i in range(evaluation_episodes):
evaluation_seed = seed + i if seed is not None else None
info, _ = run_episode(
agent=agent,
env=env,
learn=False,
exploit=True,
learn_after_episode=False,
seed=evaluation_seed,
)
g = info["return"]
print(f"episode {i}, return={g}")
random_agent_returns.append(g)
        with open(
            save_path + "returns_random_agent.pickle",
            "wb",
        ) as handle:
pickle.dump(
random_agent_returns, handle, protocol=pickle.HIGHEST_PROTOCOL
)
return random_agent_returns
# Can be generalized to different environment types; written for gym tasks for now
def evaluate_offline_rl(
env: GymEnvironment,
is_action_continuous: bool,
offline_agent: PearlAgent,
method_name: str,
training_epochs: int = 1000,
evaluation_episodes: int = 500,
url: Optional[str] = None,
data_path: Optional[str] = None,
data_collection_agent: Optional[PearlAgent] = None,
file_name: Optional[str] = None,
    data_save_path: str = "../fbsource/fbcode/pearl/offline_rl_data/",
data_size: int = 1000000,
seed: Optional[int] = None,
) -> List[float]:
"""
This function trains and evaluates an offline RL agent on the given environment. Training data
can be provided through a url or by specifying a local file path. If neither are provided,
then a 'data collection agent' must be provided. The data collection agent will be used to
collect data from the environment and save it to a file.
Args:
env: an environment to (optionally) collect data from (e.g. GymEnvironment) and evalue the
offline agent on.
is_action_continuous: used when translating the offline data to a replay buffer format
compatible with a Pearl Agent (see class TensorBasedReplayBuffer for details).
offline_agent: an offline agent to train and evaluate (for example, IQL or CQL based agent).
method_name: name of the agent's policy learner (used for saving results).
training_epochs: number of epochs to train the offline agent for.
evaluation_episodes: number of episodes to evaluate the offline agent for.
url: url to download data from.
data_path: path to a local file containing offline data to use for training.
data_collection_agent: a Pearl Agent used to collect offline data in case url or data_path
are not provided.
file_name: name of the file to store the offline data in.
data_save_path: path to a directory where the offline data will be stored.
data_size: size of the offline data (replay buffer) to be used for training.
"""
if url is None and data_path is None and data_collection_agent is None:
raise ValueError(
"Must provide either a URL, a path, or an agent to collect data."
)
# to save the offline evaluation results and/or offline data collected (if no url or data path
# is provided)
os.makedirs(data_save_path, exist_ok=True)
if url is not None or data_path is not None:
if url is not None:
print("downloading data from the given url")
else:
if os.path.isfile(data_path):
print("reading data from the given path")
else:
raise FileNotFoundError(f"No file found at {data_path}")
offline_data_replay_buffer = get_offline_data_in_buffer(
is_action_continuous, url, data_path, size=data_size
)
else:
if file_name is None:
raise ValueError("Must provide a name of file to store data.")
create_offline_data(
agent=data_collection_agent,
env=env,
save_path=data_save_path,
file_name=file_name,
max_len_offline_data=data_size,
learn=True,
learn_after_episode=False,
seed=seed,
)
print("\n")
print("collected data; starting import in replay buffer")
data_path = data_save_path + file_name
offline_data_replay_buffer = get_offline_data_in_buffer(
is_action_continuous, url, data_path, size=data_size
)
print("offline data in replay buffer; start offline training")
offline_learning(
offline_agent=offline_agent,
data_buffer=offline_data_replay_buffer,
training_epochs=training_epochs,
seed=seed,
)
print("\n")
print("offline training done; start offline evaluation")
offline_evaluation_returns = offline_evaluation(
offline_agent=offline_agent,
env=env,
number_of_episodes=evaluation_episodes,
seed=seed,
)
# save the offline evaluation results
dir_name = (
data_save_path
+ method_name
+ "/"
+ offline_agent.policy_learner._actor.__class__.__name__
+ "/"
)
os.makedirs(dir_name, exist_ok=True)
    # tag the results file with the dataset name (derived from the data file name)
    # and the number of training epochs
    dataset = (
        os.path.splitext(os.path.basename(data_path))[0]
        if data_path is not None
        else "url_data"
    )
    with open(
        dir_name
        + "returns_offline_agent_"
        + dataset
        + "_"
        + str(training_epochs)
        + ".pickle",
"wb",
) as handle:
pickle.dump(
offline_evaluation_returns, handle, protocol=pickle.HIGHEST_PROTOCOL
)
return offline_evaluation_returns
if __name__ == "__main__":
device_id = 1
experiment_seed = 100
env_name = "HalfCheetah-v4"
env = GymEnvironment(env_name)
action_space = env.action_space
is_action_continuous = True
# actor_network_type = GaussianActorNetwork
actor_network_type = VanillaContinuousActorNetwork
offline_agent = PearlAgent(
policy_learner=ImplicitQLearning(
state_dim=env.observation_space.shape[0], # pyre-ignore
action_space=action_space,
actor_hidden_dims=[256, 256],
critic_hidden_dims=[256, 256],
value_critic_hidden_dims=[256, 256],
actor_network_type=actor_network_type,
value_critic_learning_rate=1e-4,
actor_learning_rate=3e-4,
critic_learning_rate=1e-4,
critic_soft_update_tau=0.05,
training_rounds=2,
batch_size=256,
expectile=0.75,
temperature_advantage_weighted_regression=3,
),
device_id=device_id,
)
data_collection_agent = PearlAgent(
policy_learner=ContinuousSoftActorCritic(
state_dim=env.observation_space.shape[0],
action_space=env.action_space,
actor_hidden_dims=[256, 256],
critic_hidden_dims=[256, 256],
training_rounds=1,
batch_size=256,
entropy_coef=0.25,
entropy_autotune=False,
actor_learning_rate=0.0003,
critic_learning_rate=0.0005,
),
replay_buffer=FIFOOffPolicyReplayBuffer(1000000),
device_id=device_id,
)
data_save_path = "../fbsource/fbcode/pearl/offline_rl_data/" + env_name + "/"
# dataset = "small_2"
# dataset = "medium"
    # this is only for an end-to-end testing check;
    # for benchmarking, use the "small_2", "medium" or "large" datasets
dataset = "small"
file_name = "offline_raw_transitions_dict_" + dataset + ".pt"
print(" ")
print(
f"actor network type: {offline_agent.policy_learner._actor.__class__.__name__}"
)
print(f"data set name: {file_name}")
print(
f"critic update parameter: {offline_agent.policy_learner._critic_soft_update_tau}"
)
print(" ")
data_path = data_save_path + file_name
# remember to specify a data collection agent or a path to offline data
offline_agent_returns = evaluate_offline_rl(
env=env,
is_action_continuous=is_action_continuous,
offline_agent=offline_agent,
method_name="Implicit Q learning",
training_epochs=10000,
# data_path=data_path,
data_collection_agent=data_collection_agent,
file_name=file_name,
data_save_path=data_save_path,
data_size=100000,
evaluation_episodes=100,
seed=experiment_seed,
)
avg_offline_agent_returns = torch.mean(torch.tensor(offline_agent_returns))
print()
print(f"average returns of the offline agent {avg_offline_agent_returns}")
print("\n")
# getting the returns of a random agent
random_returns_file_path = data_save_path + "returns_random_agent.pickle"
random_agent_returns = get_random_agent_returns(
agent=data_collection_agent,
env=env,
save_path=data_save_path,
file_path=random_returns_file_path,
)
avg_return_random_agent = torch.mean(torch.tensor(random_agent_returns))
print(f"average returns of a random agent {avg_return_random_agent}")
print("\n")
data_collection_agent_returns = get_data_collection_agent_returns(
data_path=data_path
)
avg_return_data_collection_agent = torch.mean(
torch.tensor(data_collection_agent_returns)
)
print(
f"average returns of the data collection agent {avg_return_data_collection_agent}"
)
print(" ")
normalized_score = (avg_offline_agent_returns - avg_return_random_agent) / (
avg_return_data_collection_agent - avg_return_random_agent
)
    # ideally, the normalized score would be greater than 1 (indicating the agent
    # learned a better policy than the data collection agent), but this is not always the case
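    # worked example with hypothetical numbers: if the offline agent averages 4000,
    # the random agent -300, and the data collection agent 5000, then
    # normalized_score = (4000 - (-300)) / (5000 - (-300)) ≈ 0.81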
print(f"normalized score {normalized_score}")
    if normalized_score < 0.25:
        print(
            "Offline agent does not seem to be learning well. Check the "
            "hyperparameters of IQL_offline_method in the benchmark_config.py file "
            "and run with dataset_name = `small_2`."
        )
Functions
def evaluate_offline_rl(env: GymEnvironment, is_action_continuous: bool, offline_agent: PearlAgent, method_name: str, training_epochs: int = 1000, evaluation_episodes: int = 500, url: Optional[str] = None, data_path: Optional[str] = None, data_collection_agent: Optional[PearlAgent] = None, file_name: Optional[str] = None, data_save_path: str = '../fbsource/fbcode/pearl/offline_rl_data/', data_size: int = 1000000, seed: Optional[int] = None) -> List[float]
-
This function trains and evaluates an offline RL agent on the given environment. Training data can be provided through a url or by specifying a local file path. If neither is provided, then a 'data collection agent' must be provided; it will be used to collect data from the environment and save it to a file.
Args
env
- an environment to (optionally) collect data from (e.g. GymEnvironment) and evaluate the offline agent on.
is_action_continuous
- used when translating the offline data to a replay buffer format compatible with a Pearl Agent (see class TensorBasedReplayBuffer for details).
offline_agent
- an offline agent to train and evaluate (for example, IQL or CQL based agent).
method_name
- name of the agent's policy learner (used for saving results).
training_epochs
- number of epochs to train the offline agent for.
evaluation_episodes
- number of episodes to evaluate the offline agent for.
url
- url to download data from.
data_path
- path to a local file containing offline data to use for training.
data_collection_agent
- a Pearl Agent used to collect offline data when neither url nor data_path is provided.
file_name
- name of the file to store the offline data in.
data_save_path
- path to a directory where the offline data and results will be stored.
data_size
- size of the offline data (replay buffer) to be used for training.
seed
- optional seed for data collection, training, and evaluation.
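Below is a minimal usage sketch. It assumes a previously collected data file at data_path; the environment name, hyperparameters, and file locations are illustrative (mirroring the module's __main__ block), not prescribed defaults:

env = GymEnvironment("HalfCheetah-v4")
offline_agent = PearlAgent(
    policy_learner=ImplicitQLearning(
        state_dim=env.observation_space.shape[0],
        action_space=env.action_space,
        actor_hidden_dims=[256, 256],
        critic_hidden_dims=[256, 256],
        value_critic_hidden_dims=[256, 256],
        actor_network_type=VanillaContinuousActorNetwork,
        value_critic_learning_rate=1e-4,
        actor_learning_rate=3e-4,
        critic_learning_rate=1e-4,
        critic_soft_update_tau=0.05,
        training_rounds=2,
        batch_size=256,
        expectile=0.75,
        temperature_advantage_weighted_regression=3,
    ),
)
returns = evaluate_offline_rl(
    env=env,
    is_action_continuous=True,
    offline_agent=offline_agent,
    method_name="Implicit Q learning",
    training_epochs=1000,
    evaluation_episodes=100,
    data_path="offline_rl_data/HalfCheetah-v4/offline_raw_transitions_dict_small.pt",
    data_save_path="offline_rl_data/HalfCheetah-v4/",
    seed=100,
)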
def get_random_agent_returns(agent: Agent, env: Environment, save_path: Optional[str], file_path: Optional[str] = None, learn: bool = False, learn_after_episode: bool = False, evaluation_episodes: int = 500, seed: Optional[int] = None) -> List[float]
-
This function returns a list of episode returns of a Pearl Agent interacting with the input environment using a randomly initialized policy learner. This is needed to compute the baseline for calculating normalized scores for offline RL benchmarks.
Args
agent
- a Pearl Agent with a randomly initialized policy learner.
env
- an environment to collect data from (e.g. GymEnvironment).
save_path
- directory where the collected episode returns are pickled; should end with a '/' since it is concatenated directly with the file name.
file_path
- optional path to a pickle file with previously saved returns; if provided, returns are loaded from it instead of being collected.
learn
- unused; episodes are always run with learn=False.
learn_after_episode
- unused; episodes are always run with learn_after_episode=False.
evaluation_episodes
- number of episodes to run the random agent for.
seed
- optional base seed; episode i is run with seed + i.
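A short usage sketch (the agent construction mirrors the module's __main__ block; any freshly constructed, untrained PearlAgent serves as the random agent). Note that save_path is concatenated directly with the output file name, so it should end with a '/':

env = GymEnvironment("HalfCheetah-v4")
random_agent = PearlAgent(
    policy_learner=ContinuousSoftActorCritic(
        state_dim=env.observation_space.shape[0],
        action_space=env.action_space,
        actor_hidden_dims=[256, 256],
        critic_hidden_dims=[256, 256],
        training_rounds=1,
        batch_size=256,
        entropy_coef=0.25,
        entropy_autotune=False,
        actor_learning_rate=0.0003,
        critic_learning_rate=0.0005,
    ),
    replay_buffer=FIFOOffPolicyReplayBuffer(1000000),
)
returns = get_random_agent_returns(
    agent=random_agent,
    env=env,
    save_path="offline_rl_data/HalfCheetah-v4/",
    evaluation_episodes=100,
    seed=100,
)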