Module pearl.replay_buffers.sequential_decision_making.hindsight_experience_replay_buffer
from typing import Callable, List, Optional, Tuple
from pearl.api.action import Action
from pearl.api.action_space import ActionSpace
from pearl.api.reward import Reward
from pearl.api.state import SubjectiveState
from pearl.replay_buffers.sequential_decision_making.fifo_off_policy_replay_buffer import (
    FIFOOffPolicyReplayBuffer,
)
from pearl.utils.tensor_like import assert_is_tensor_like
class HindsightExperienceReplayBuffer(FIFOOffPolicyReplayBuffer):
    """
    paper: https://arxiv.org/pdf/1707.01495.pdf
    final mode for alternative only for now
    TLDR:
    HindsightExperienceReplayBuffer is used for sparse reward problems.
    After an episode ends, apart from pushing original data in,
    it will replace original goal with final state in the episode,
    and replay the transitions again for new rewards and push
    capacity: size of the replay buffer
    goal_dim: dimension of goal of the problem.
              Subjective state input to `push` method will be the final state representation
              so we could need this info in order to split alternative goal after episode
              terminates.
    reward_fn: is the F here: F(state+goal, action) = reward
    done_fn: This is different from paper. Original paper doesn't have it.
             We need it for games which may end earlier.
             If this is not defined, then use done value from original trajectory.
    """
    def __init__(
        self,
        capacity: int,
        goal_dim: int,
        reward_fn: Callable[[SubjectiveState, Action], Reward],
        done_fn: Optional[Callable[[SubjectiveState, Action], bool]] = None,
    ) -> None:
        super(HindsightExperienceReplayBuffer, self).__init__(capacity=capacity)
        self._goal_dim = goal_dim
        self._reward_fn = reward_fn
        self._done_fn = done_fn
        self._trajectory: List[
            Tuple[
                SubjectiveState,
                Action,
                SubjectiveState,
                ActionSpace,
                ActionSpace,
                bool,
                Optional[int],
                Optional[float],
            ]
        ] = []
    def push(
        self,
        state: SubjectiveState,
        action: Action,
        reward: Reward,
        next_state: SubjectiveState,
        curr_available_actions: ActionSpace,
        next_available_actions: ActionSpace,
        done: bool,
        max_number_actions: Optional[int] = None,
        cost: Optional[float] = None,
    ) -> None:
        next_state = assert_is_tensor_like(next_state)
        # state and next_state are expected to already carry the goal concatenated at the end
        super(HindsightExperienceReplayBuffer, self).push(
            # the incoming state already has the goal concatenated onto it
            state,
            action,
            reward,
            next_state,
            curr_available_actions,
            next_available_actions,
            done,
            max_number_actions,
            cost,
        )
        self._trajectory.append(
            (
                state,
                action,
                next_state,
                curr_available_actions,
                next_available_actions,
                done,
                max_number_actions,
                cost,
            )
        )
        if done:
            # "final" strategy: relabel with the state achieved at the end of the
            # episode as the alternative goal (this assumes the observation part
            # of the state has the same dimension as the goal)
            additional_goal = next_state[: -self._goal_dim]
            for (
                state,
                action,
                next_state,
                curr_available_actions,
                next_available_actions,
                done,
                max_number_actions,
                cost,
            ) in self._trajectory:
                # replace current_goal with additional_goal
                state = assert_is_tensor_like(state)
                next_state = assert_is_tensor_like(next_state)
                state[-self._goal_dim :] = additional_goal
                next_state[-self._goal_dim :] = additional_goal
                super(HindsightExperienceReplayBuffer, self).push(
                    state,
                    action,
                    self._reward_fn(state, action),
                    next_state,
                    curr_available_actions,
                    next_available_actions,
                    done if self._done_fn is None else self._done_fn(state, action),
                    max_number_actions,
                    cost,
                )
            self._trajectory = []
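For concreteness, the sketch below shows one way the pieces above could be wired together for a goal-reaching task in which the observation and the goal share the same dimension (the assumption the "final" relabeling relies on). GOAL_DIM and both callables are illustrative assumptions, not part of this module; only the constructor signature is taken from the class above.

import torch

from pearl.replay_buffers.sequential_decision_making.hindsight_experience_replay_buffer import (
    HindsightExperienceReplayBuffer,
)

GOAL_DIM = 8  # hypothetical goal dimension; must equal the observation dimension for "final" relabeling


def sparse_reward_fn(state_and_goal: torch.Tensor, action: torch.Tensor) -> float:
    # F(state+goal, action) = reward: 0 when the achieved state matches the goal, -1 otherwise
    achieved, goal = state_and_goal[:-GOAL_DIM], state_and_goal[-GOAL_DIM:]
    return 0.0 if torch.equal(achieved, goal) else -1.0


def early_done_fn(state_and_goal: torch.Tensor, action: torch.Tensor) -> bool:
    # optional early-termination check: the episode ends once the goal is reached
    return torch.equal(state_and_goal[:-GOAL_DIM], state_and_goal[-GOAL_DIM:])


her_buffer = HindsightExperienceReplayBuffer(
    capacity=10_000,
    goal_dim=GOAL_DIM,
    reward_fn=sparse_reward_fn,
    done_fn=early_done_fn,
)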
Classes
- class HindsightExperienceReplayBuffer (capacity: int, goal_dim: int, reward_fn: Callable[[torch.Tensor, torch.Tensor], object], done_fn: Optional[Callable[[torch.Tensor, torch.Tensor], bool]] = None)
  Hindsight Experience Replay (HER) buffer for sparse-reward problems (https://arxiv.org/pdf/1707.01495.pdf); see the class docstring in the source above.
Ancestors
- pearl.replay_buffers.sequential_decision_making.fifo_off_policy_replay_buffer.FIFOOffPolicyReplayBuffer
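As a usage note, `push` expects state and next_state to already carry the goal concatenated at the end; the buffer never performs the concatenation itself. Below is a minimal sketch of that convention. The helper name is hypothetical, and the action-space object is left as a parameter because its concrete type depends on the environment.

import torch


def push_goal_conditioned_transition(
    buffer,            # a HindsightExperienceReplayBuffer, as constructed above
    observation,       # torch.Tensor, current observation
    goal,              # torch.Tensor of length goal_dim, the original goal
    action,            # action taken in this step
    reward,            # reward returned by the environment
    next_observation,  # torch.Tensor, next observation
    done,              # bool, whether the episode terminated
    action_space,      # whatever ActionSpace instance the environment provides
):
    # The goal is appended to both state tensors so the buffer can later split it
    # off (via goal_dim) and relabel it with the final achieved state.
    state = torch.cat([observation, goal])
    next_state = torch.cat([next_observation, goal])
    buffer.push(
        state=state,
        action=action,
        reward=reward,
        next_state=next_state,
        curr_available_actions=action_space,
        next_available_actions=action_space,
        done=done,
    )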