Module pearl.replay_buffers.sequential_decision_making.hindsight_experience_replay_buffer
from typing import Callable, List, Optional, Tuple
from pearl.api.action import Action
from pearl.api.action_space import ActionSpace
from pearl.api.reward import Reward
from pearl.api.state import SubjectiveState
from pearl.replay_buffers.sequential_decision_making.fifo_off_policy_replay_buffer import (
FIFOOffPolicyReplayBuffer,
)
from pearl.utils.tensor_like import assert_is_tensor_like
class HindsightExperienceReplayBuffer(FIFOOffPolicyReplayBuffer):
"""
paper: https://arxiv.org/pdf/1707.01495.pdf
final mode for alternative only for now
TLDR:
HindsightExperienceReplayBuffer is used for sparse reward problems.
After an episode ends, apart from pushing original data in,
it will replace original goal with final state in the episode,
and replay the transitions again for new rewards and push
capacity: size of the replay buffer
goal_dim: dimension of goal of the problem.
Subjective state input to `push` method will be the final state representation
so we could need this info in order to split alternative goal after episode
terminates.
reward_fn: is the F here: F(state+goal, action) = reward
done_fn: This is different from paper. Original paper doesn't have it.
We need it for games which may end earlier.
If this is not defined, then use done value from original trajectory.
"""
# TODO: improve unclear docstring
def __init__(
self,
capacity: int,
goal_dim: int,
reward_fn: Callable[[SubjectiveState, Action], Reward],
done_fn: Optional[Callable[[SubjectiveState, Action], bool]] = None,
) -> None:
super(HindsightExperienceReplayBuffer, self).__init__(capacity=capacity)
self._goal_dim = goal_dim
self._reward_fn = reward_fn
self._done_fn = done_fn
self._trajectory: List[
Tuple[
SubjectiveState,
Action,
SubjectiveState,
ActionSpace,
ActionSpace,
bool,
Optional[int],
Optional[float],
]
] = []
def push(
self,
state: SubjectiveState,
action: Action,
reward: Reward,
next_state: SubjectiveState,
curr_available_actions: ActionSpace,
next_available_actions: ActionSpace,
done: bool,
max_number_actions: Optional[int] = None,
cost: Optional[float] = None,
) -> None:
next_state = assert_is_tensor_like(next_state)
        # state and next_state arrive with the goal already concatenated at the end;
        # push the original (non-relabeled) transition as-is
        super(HindsightExperienceReplayBuffer, self).push(
state,
action,
reward,
next_state,
curr_available_actions,
next_available_actions,
done,
max_number_actions,
cost,
)
self._trajectory.append(
(
state,
action,
next_state,
curr_available_actions,
next_available_actions,
done,
max_number_actions,
cost,
)
)
if done:
            # "final" strategy: the observation reached at the end of the episode
            # becomes the alternative goal (this assumes the non-goal part of the
            # state has the same dimension as the goal, since it is written into
            # the goal slot below)
            additional_goal = next_state[: -self._goal_dim]
for (
state,
action,
next_state,
curr_available_actions,
next_available_actions,
done,
max_number_actions,
cost,
) in self._trajectory:
                # relabel: overwrite the goal portion of state and next_state with
                # additional_goal (in place on the stored tensors)
                state = assert_is_tensor_like(state)
                next_state = assert_is_tensor_like(next_state)
                state[-self._goal_dim :] = additional_goal
                next_state[-self._goal_dim :] = additional_goal
super(HindsightExperienceReplayBuffer, self).push(
state,
action,
self._reward_fn(state, action),
next_state,
curr_available_actions,
next_available_actions,
done if self._done_fn is None else self._done_fn(state, action),
max_number_actions,
cost,
)
self._trajectory = []
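For orientation, here is a minimal usage sketch. It is not taken from the Pearl documentation: the goal dimension, reward function, actions, and environment values are hypothetical, and the DiscreteActionSpace import path and constructor are assumed from Pearl's utilities rather than confirmed by this module.

import torch

from pearl.replay_buffers.sequential_decision_making.hindsight_experience_replay_buffer import (
    HindsightExperienceReplayBuffer,
)
from pearl.utils.instantiations.spaces.discrete_action import (  # assumed import path
    DiscreteActionSpace,
)

GOAL_DIM = 2  # hypothetical: 2-D observation with a 2-D goal appended

def sparse_reward(state_and_goal: torch.Tensor, action: torch.Tensor) -> float:
    # Sparse reward computed on the concatenated [observation | goal] vector:
    # 0 when the observation part matches the goal part, -1 otherwise.
    observation = state_and_goal[:-GOAL_DIM]
    goal = state_and_goal[-GOAL_DIM:]
    return 0.0 if torch.allclose(observation, goal) else -1.0

# Assumed constructor: a list of action tensors.
action_space = DiscreteActionSpace(actions=[torch.tensor([0]), torch.tensor([1])])

buffer = HindsightExperienceReplayBuffer(
    capacity=10_000,
    goal_dim=GOAL_DIM,
    reward_fn=sparse_reward,
    # done_fn omitted: the `done` flag of the original trajectory is reused.
)

# Each pushed state is [observation | goal]; the goal occupies the last GOAL_DIM entries.
buffer.push(
    state=torch.tensor([0.0, 0.0, 5.0, 5.0]),
    action=torch.tensor([0]),
    reward=-1.0,
    next_state=torch.tensor([1.0, 1.0, 5.0, 5.0]),
    curr_available_actions=action_space,
    next_available_actions=action_space,
    done=True,  # end of episode: relabeled copies of the trajectory are pushed as well
    max_number_actions=action_space.n,
)

Because done=True here, the buffer immediately replays the (one-step) episode with the goal relabeled to the final achieved observation and the reward recomputed by sparse_reward.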
Classes
class HindsightExperienceReplayBuffer (capacity: int, goal_dim: int, reward_fn: Callable[[torch.Tensor, torch.Tensor], object], done_fn: Optional[Callable[[torch.Tensor, torch.Tensor], bool]] = None)
Ancestors
pearl.replay_buffers.sequential_decision_making.fifo_off_policy_replay_buffer.FIFOOffPolicyReplayBuffer
Inherited members
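To make the "final" relabeling concrete, the following standalone snippet mirrors the slicing performed in `push` above when an episode ends. The tensor values are hypothetical examples, not Pearl API calls.

import torch

goal_dim = 2

# Final next_state of the episode, laid out as [observation | goal].
final_next_state = torch.tensor([1.0, 1.0, 5.0, 5.0])

# "final" strategy: the observation actually reached becomes the alternative goal.
additional_goal = final_next_state[:-goal_dim]  # tensor([1., 1.])

# Each stored transition of the episode has its goal slot overwritten in place ...
state = torch.tensor([0.0, 0.0, 5.0, 5.0])
state[-goal_dim:] = additional_goal  # state is now tensor([0., 0., 1., 1.])

# ... and is re-pushed with reward_fn(state, action) (and done_fn, if given).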