Module pearl.utils.instantiations.environments.contextual_bandit_linear_synthetic_environment
Expand source code
from typing import Optional, Tuple
import torch
from pearl.api.action import Action
from pearl.api.action_space import ActionSpace
from pearl.api.observation import Observation
from pearl.api.reward import Value
from pearl.utils.instantiations.environments.contextual_bandit_environment import (
ContextualBanditEnvironment,
)
from pearl.utils.instantiations.spaces.discrete_action import DiscreteActionSpace
class ContextualBanditLinearSyntheticEnvironment(ContextualBanditEnvironment):
"""
A Contextual Bandit synthetic environment where the reward is linearly mapped from the
context feature representation.
The purpose of this environment is to simulate the behavior of a Contextual Bandit
where rewards are modeled linearly from the context features.
    Following
    Lihong Li, Wei Chu, John Langford, Robert E. Schapire (2010),
    "A Contextual-Bandit Approach to Personalized News Article Recommendation,"
    the context for an arm is the concatenation of the observation feature vector
    and the arm feature vector.
"""
def __init__(
self,
action_space: ActionSpace,
observation_dim: int = 0,
arm_feature_vector_dim: int = 4,
reward_noise_sigma: float = 0.0,
simple_linear_mapping: bool = False,
) -> None:
"""
Args:
action_space (ActionSpace): the environment's action space
observation_dim (int): the number of dimensions in the observation feature vector
arm_feature_vector_dim (int): the number of dimensions in the feature representation
of arms
reward_noise_sigma (float): the standard deviation of the noise added to the reward
            simple_linear_mapping (bool): if True, the reward is simply the sum of the
                context features (for debugging purposes)
"""
assert isinstance(action_space, DiscreteActionSpace)
self._action_space: DiscreteActionSpace = action_space
self.observation_dim = observation_dim
self._arm_feature_vector_dim = arm_feature_vector_dim
self.reward_noise_sigma = reward_noise_sigma
self._simple_linear_mapping = simple_linear_mapping
self._features_of_all_arms: torch.Tensor = self._generate_features_of_all_arms()
self._linear_mapping: torch.nn.Module = self._make_initial_linear_mapping()
self._observation: Optional[torch.Tensor] = None
@property
def action_space(self) -> ActionSpace:
return self._action_space
@property
def arm_feature_vector_dim(self) -> int:
return self._arm_feature_vector_dim
@property
def features_of_all_arms(self) -> torch.Tensor:
return self._features_of_all_arms
@property
def linear_mapping(self) -> torch.nn.Module:
return self._linear_mapping
def _generate_features_of_all_arms(self) -> torch.Tensor:
features_of_all_arms = torch.rand(
self._action_space.n,
self.arm_feature_vector_dim,
        )  # features of each arm; shape: (number of actions, arm_feature_vector_dim)
return features_of_all_arms
def _make_initial_linear_mapping(
self,
) -> torch.nn.Module:
"""
The function that maps context to reward (always linear).
The input dimension (in_features) is observation dimension + arm feature vector dimension.
The output (reward) is a scalar, so output dimension (out_features) is 1.
"""
return torch.nn.Linear(
in_features=self.observation_dim + self.arm_feature_vector_dim,
out_features=1,
)
def reset(self, seed: Optional[int] = None) -> Tuple[Observation, ActionSpace]:
"""
Provides the observation and action space to the agent.
"""
self._observation = torch.rand(self.observation_dim)
return self._observation, self.action_space
def get_reward(self, action: Action) -> Value:
"""
        Given an action, the environment returns the reward associated with this action.
"""
# TODO: This assumes the action is an int tensor.
context = self._get_context_for_arm(int(action.item()))
reward = self._compute_reward_from_context(context)
return reward
def get_regret(self, action: Action) -> Value:
"""
        Given an action, the environment returns the regret for choosing this action,
        where regret == max(reward over all actions) - reward of the chosen action.
"""
rewards = [
self._compute_reward_from_context(self._get_context_for_arm(i))
for i in range(self._action_space.n)
]
# pyre-fixme[6]: For 1st argument expected
# `Iterable[Variable[SupportsRichComparisonT (bound to
# Union[SupportsDunderGT[typing.Any], SupportsDunderLT[typing.Any]])]]` but
# got `List[Tensor]`.
# Requires greater cleanup
return max(rewards) - rewards[action]
def _get_context_for_arm(self, action: int) -> torch.Tensor:
assert action in range(self._action_space.n) # action is index in action_space
assert self._observation is not None
return torch.cat([self._observation, self.features_of_all_arms[action]])
def _compute_reward_from_context(
self,
context: torch.Tensor,
) -> torch.Tensor:
if self._simple_linear_mapping:
reward = torch.sum(context).unsqueeze(dim=0)
            # Assume a simple linear relationship between context and reward:
            # r_k = sum(f_k), i.e. the reward is the sum of the context features.
            # This is convenient for debugging algorithms during development.
else:
reward = self._compute_reward_from_context_using_linear_mapping(context)
return reward
def _compute_reward_from_context_using_linear_mapping(
self,
context: torch.Tensor,
) -> torch.Tensor:
"""
        We assume a linear relationship between context and reward: r_k = g(f_k),
        where g() is a linear function mapping the context to a scalar reward.
        The output is the reward, a Value.
"""
# map features to rewards through a linear function W
reward = self.linear_mapping(context)
if self.reward_noise_sigma > 0.0:
# add some Gaussian noise to each reward
noise = torch.randn_like(reward) * self.reward_noise_sigma
noisy_reward = reward + noise
return noisy_reward
else:
return reward
def __str__(self) -> str:
return "Bandit with reward linearly mapped from context feature vector"
Classes
class ContextualBanditLinearSyntheticEnvironment (action_space: ActionSpace, observation_dim: int = 0, arm_feature_vector_dim: int = 4, reward_noise_sigma: float = 0.0, simple_linear_mapping: bool = False)
-
A Contextual Bandit synthetic environment where the reward is linearly mapped from the context feature representation. The purpose of this environment is to simulate the behavior of a Contextual Bandit where rewards are modeled linearly from the context features.
Following Lihong Li, Wei Chu, John Langford, Robert E. Schapire (2010), "A Contextual-Bandit Approach to Personalized News Article Recommendation," the context for an arm is the concatenation of the observation feature vector and the arm feature vector.
Args
action_space : ActionSpace
- the environment's action space
observation_dim : int
- the number of dimensions in the observation feature vector
arm_feature_vector_dim : int
- the number of dimensions in the feature representation of arms
reward_noise_sigma : float
- the standard deviation of the noise added to the reward
simple_linear_mapping : bool
- if True, the reward is simply the sum of the context features (for debugging purposes)
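A minimal usage sketch is shown below. It assumes that DiscreteActionSpace is constructed from a list of integer action tensors (as in other Pearl utilities); the dimensions, noise level, and chosen arm are purely illustrative.

import torch

from pearl.utils.instantiations.environments.contextual_bandit_linear_synthetic_environment import (
    ContextualBanditLinearSyntheticEnvironment,
)
from pearl.utils.instantiations.spaces.discrete_action import DiscreteActionSpace

# Five arms, each represented by a single-element integer tensor.
action_space = DiscreteActionSpace(actions=[torch.tensor([i]) for i in range(5)])
env = ContextualBanditLinearSyntheticEnvironment(
    action_space=action_space,
    observation_dim=3,
    arm_feature_vector_dim=4,
    reward_noise_sigma=0.1,
)

observation, action_space = env.reset()  # observation: tensor of shape (3,)
action = torch.tensor(2)                 # get_reward expects an int tensor (see TODO in the source)
reward = env.get_reward(action)          # noisy linear reward, tensor of shape (1,)
regret = env.get_regret(action)          # max reward over all arms minus the reward of arm 2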
Ancestors
ContextualBanditEnvironment
Instance variables
var arm_feature_vector_dim : int
- The number of dimensions in each arm's feature vector.
var features_of_all_arms : torch.Tensor
- The feature vectors of all arms, a tensor of shape (number of actions, arm_feature_vector_dim).
var linear_mapping : torch.nn.modules.module.Module
- The linear module that maps a context vector to a scalar reward.
Methods
def get_regret(self, action: torch.Tensor) ‑> object
-
Given an action, the environment returns the regret for choosing this action, where regret == max(reward over all actions) - reward of the chosen action.
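As a sanity check of this definition, the arm with the highest reward has exactly zero regret when reward_noise_sigma == 0.0 (the default). A short sketch, reusing the imports from the usage example above:

env = ContextualBanditLinearSyntheticEnvironment(
    action_space=DiscreteActionSpace(actions=[torch.tensor([i]) for i in range(5)]),
    observation_dim=3,
)
env.reset()  # samples the observation used to build every arm's context
regrets = [env.get_regret(torch.tensor(i)) for i in range(5)]
assert min(r.item() for r in regrets) == 0.0  # the best arm incurs no regret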
def get_reward(self, action: torch.Tensor) ‑> object
-
Given an action, the environment returns the reward associated with this action.
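With the defaults (reward_noise_sigma=0.0 and simple_linear_mapping=False) the reward is deterministic, so the value returned for an arm can be reproduced from the exposed linear_mapping and features_of_all_arms properties. A sketch, again reusing the imports from the usage example above:

env = ContextualBanditLinearSyntheticEnvironment(
    action_space=DiscreteActionSpace(actions=[torch.tensor([i]) for i in range(3)]),
    observation_dim=2,
)
observation, _ = env.reset()
arm = 1
context = torch.cat([observation, env.features_of_all_arms[arm]])
expected = env.linear_mapping(context)  # W @ [observation; arm features] + bias
assert torch.allclose(env.get_reward(torch.tensor(arm)), expected)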
def reset(self, seed: Optional[int] = None) ‑> Tuple[object, ActionSpace]
-
Provides the observation and action space to the agent.
Inherited members