Module pearl.policy_learners.contextual_bandits.neural_linear_bandit
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Any, Dict, List, Optional
import torch
from pearl.api.action import Action
from pearl.api.action_space import ActionSpace
from pearl.history_summarization_modules.history_summarization_module import (
SubjectiveState,
)
from pearl.policy_learners.contextual_bandits.contextual_bandit_base import (
DEFAULT_ACTION_SPACE,
)
from pearl.policy_learners.contextual_bandits.neural_bandit import NeuralBandit
from pearl.policy_learners.exploration_modules.contextual_bandits.ucb_exploration import (
UCBExploration,
)
from pearl.policy_learners.exploration_modules.exploration_module import (
ExplorationModule,
)
from pearl.replay_buffers.transition import TransitionBatch
from pearl.utils.functional_utils.learning.action_utils import (
concatenate_actions_to_state,
)
from pearl.utils.functional_utils.learning.linear_regression import LinearRegression
from pearl.utils.instantiations.spaces.discrete_action import DiscreteActionSpace
class NeuralLinearBandit(NeuralBandit):
"""
Policy Learner for Contextual Bandit with:
features --> neural networks --> linear regression --> predicted rewards
"""
def __init__(
self,
feature_dim: int,
hidden_dims: List[int], # last one is the input dim for linear regression
exploration_module: ExplorationModule,
training_rounds: int = 100,
batch_size: int = 128,
learning_rate: float = 0.001,
l2_reg_lambda_linear: float = 1.0,
state_features_only: bool = False,
**kwargs: Any,
) -> None:
assert (
len(hidden_dims) >= 1
), "hidden_dims should have at least one value to specify feature dim for linear regression"
super(NeuralLinearBandit, self).__init__(
feature_dim=feature_dim,
hidden_dims=hidden_dims[:-1],
output_dim=hidden_dims[-1],
training_rounds=training_rounds,
            batch_size=batch_size,
            learning_rate=learning_rate,
exploration_module=exploration_module,
state_features_only=state_features_only,
**kwargs,
)
# TODO specify linear regression type when needed
self._linear_regression = LinearRegression(
feature_dim=hidden_dims[-1],
l2_reg_lambda=l2_reg_lambda_linear,
)
self._linear_regression_dim: int = hidden_dims[-1]
def learn_batch(self, batch: TransitionBatch) -> Dict[str, Any]:
if self._state_features_only:
input_features = batch.state
else:
input_features = torch.cat([batch.state, batch.action], dim=1)
# forward pass
mlp_output = self._deep_represent_layers(input_features)
current_values = self._linear_regression(mlp_output)
expected_values = batch.reward
criterion = torch.nn.MSELoss()
loss = criterion(current_values.view(expected_values.shape), expected_values)
# Optimize the deep layer
# TODO how should we handle weight in NN training
self._optimizer.zero_grad()
loss.backward()
self._optimizer.step()
# Optimize linear regression
batch_weight = batch.weight
self._linear_regression.learn_batch(
mlp_output.detach(),
expected_values,
batch_weight,
)
return {"mlp_loss": loss.item(), "current_values": current_values.mean().item()}
def act(
self,
subjective_state: SubjectiveState,
action_space: ActionSpace,
action_availability_mask: Optional[torch.Tensor] = None,
exploit: bool = False,
) -> Action:
        assert isinstance(action_space, DiscreteActionSpace)
        # It doesn't make sense to call act if we are not working with an action vector
        assert action_space.action_dim > 0
new_feature = concatenate_actions_to_state(
subjective_state=subjective_state,
action_space=action_space,
state_features_only=self._state_features_only,
)
mlp_values = self._deep_represent_layers(new_feature)
        # `_linear_regression` is not nn.Linear().
        # It is a customized linear layer that can be updated by an analytical
        # method (matrix computations) rather than by gradient descent with a
        # torch optimizer.
values = self._linear_regression(mlp_values)
# batch_size * action_count
assert values.numel() == new_feature.shape[0] * action_space.n
        # subjective_state=mlp_values because uncertainty is only measured in the output linear layer
        # revisit for other exploration modules
return self._exploration_module.act(
subjective_state=mlp_values,
action_space=action_space,
values=values,
action_availability_mask=action_availability_mask,
representation=self._linear_regression,
)
def get_scores(
self,
subjective_state: SubjectiveState,
action_space: DiscreteActionSpace = DEFAULT_ACTION_SPACE,
) -> torch.Tensor:
# TODO generalize for all kinds of exploration module
feature = concatenate_actions_to_state(
subjective_state=subjective_state,
action_space=action_space,
state_features_only=self._state_features_only,
)
batch_size = feature.shape[0]
feature_dim = feature.shape[-1]
# dim: [batch_size * num_arms, feature_dim]
feature = feature.reshape(-1, feature_dim)
        # dim: [batch_size * num_arms, hidden_dims[-1]]
processed_feature = self._deep_represent_layers(feature)
# dim: [batch_size * num_arms, 1]
assert isinstance(self._exploration_module, UCBExploration)
scores = self._exploration_module.get_scores(
subjective_state=processed_feature,
values=self._linear_regression(processed_feature),
action_space=action_space,
representation=self._linear_regression,
)
# dim: [batch_size, num_arms] or [batch_size]
return scores.reshape(batch_size, -1).squeeze()
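The `_linear_regression` head above is fit in closed form rather than by the torch optimizer, and the UCB exploration module derives its uncertainty bonus from that linear representation. Below is a minimal, self-contained sketch of that general technique (weighted ridge regression plus a LinUCB-style score); it is not Pearl's actual `LinearRegression` or `UCBExploration` implementation, and all class, method, and parameter names in it are illustrative assumptions.

# Illustrative sketch only, not Pearl code: an analytically updated linear
# head with a LinUCB-style score.
from typing import Optional
import torch

class AnalyticLinearHead:
    def __init__(self, feature_dim: int, l2_reg_lambda: float = 1.0) -> None:
        # Sufficient statistics of weighted ridge regression:
        # A = sum_i w_i x_i x_i^T + lambda * I,   b = sum_i w_i r_i x_i
        self._A = l2_reg_lambda * torch.eye(feature_dim)
        self._b = torch.zeros(feature_dim)

    def learn_batch(
        self,
        x: torch.Tensor,                         # [n, feature_dim] features
        r: torch.Tensor,                         # [n] observed rewards
        weight: Optional[torch.Tensor] = None,   # [n] sample weights
    ) -> None:
        if weight is None:
            weight = torch.ones(x.shape[0])
        weight = weight.view(-1, 1)
        r = r.view(-1, 1)
        self._A += (weight * x).t() @ x          # accumulate w * x x^T
        self._b += (weight * r * x).sum(dim=0)   # accumulate w * r * x

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        # Closed-form ridge solution A^{-1} b gives the coefficients;
        # the predicted reward is x @ coefs.
        coefs = torch.linalg.solve(self._A, self._b)
        return x @ coefs

    def ucb_score(self, x: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
        # LinUCB-style score: predicted reward plus alpha * sqrt(x^T A^{-1} x),
        # the kind of uncertainty bonus a UCB exploration module can compute
        # from this linear representation.
        sigma = torch.sqrt((x * torch.linalg.solve(self._A, x.t()).t()).sum(dim=1))
        return self(x) + alpha * sigma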
Classes
class NeuralLinearBandit (feature_dim: int, hidden_dims: List[int], exploration_module: ExplorationModule, training_rounds: int = 100, batch_size: int = 128, learning_rate: float = 0.001, l2_reg_lambda_linear: float = 1.0, state_features_only: bool = False, **kwargs: Any)
Policy Learner for Contextual Bandit with: features --> neural networks --> linear regression --> predicted rewards
Initializes internal Module state, shared by both nn.Module and ScriptModule.
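A minimal usage sketch for this class follows. The `UCBExploration(alpha=...)` and `DiscreteActionSpace(actions=...)` constructor arguments, as well as all numeric values, are illustrative assumptions rather than confirmed API details.

import torch

from pearl.policy_learners.contextual_bandits.neural_linear_bandit import (
    NeuralLinearBandit,
)
from pearl.policy_learners.exploration_modules.contextual_bandits.ucb_exploration import (
    UCBExploration,
)
from pearl.utils.instantiations.spaces.discrete_action import DiscreteActionSpace

# Two one-dimensional action vectors; `actions=` is assumed to be the
# DiscreteActionSpace constructor argument.
action_space = DiscreteActionSpace(actions=[torch.tensor([0.0]), torch.tensor([1.0])])

policy_learner = NeuralLinearBandit(
    feature_dim=10,                    # state dim (9) + action dim (1)
    hidden_dims=[64, 16],              # 16 is the input dim of the linear head
    exploration_module=UCBExploration(alpha=1.0),  # `alpha` is assumed here
    learning_rate=0.001,
    l2_reg_lambda_linear=1.0,
)

# Exploration-aware action selection for a single 9-dimensional state.
action = policy_learner.act(
    subjective_state=torch.randn(9),
    action_space=action_space,
)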
Ancestors
- NeuralBandit
- ContextualBanditBase
- PolicyLearner
- torch.nn.modules.module.Module
- abc.ABC
Inherited members