Demonstration
This notebook demonstrates the use of G-learning with quadratic rewards to optimize a defined-contribution retirement plan. The demonstration follows [DHB20].
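Schematically, and in the notation of the class defined below, G-learning works with a G-function (an entropy-regularized analogue of the action-value function) and a free-energy function $F$ that satisfy the backward recursion of [DHB20] (a sketch, not a full derivation):

$$
G_t(x_t, u_t) \;=\; R_t(x_t, u_t) \;+\; \gamma\,\mathbb{E}_t\!\left[ F_{t+1}(x_{t+1}) \mid x_t, u_t \right],
\qquad
F_t(x_t) \;=\; \frac{1}{\beta}\,\log \int \pi_0(u_t \mid x_t)\, e^{\beta\, G_t(x_t, u_t)}\, du_t ,
$$

with the optimal policy obtained by exponentially tilting the Gaussian prior, $\pi(u_t \mid x_t) \propto \pi_0(u_t \mid x_t)\, e^{\beta G_t(x_t, u_t)}$. Because the one-step reward is quadratic and the prior policy is Gaussian, all of these objects remain quadratic/Gaussian, and the recursion can be carried out in closed form; this is what the class below implements.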
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
%matplotlib inline
import time
from scipy.optimize import minimize
import torch
import torch.optim as optim
import warnings
warnings.filterwarnings('ignore')
%env KMP_DUPLICATE_LIB_OK=TRUE
# set the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
env: KMP_DUPLICATE_LIB_OK=TRUE
# Define the G-learning portfolio optimization class
class G_learning_portfolio_opt:
def __init__(self,
num_steps,
params,
beta,
benchmark_portf,
gamma,
num_risky_assets,
riskfree_rate,
exp_returns, # array of shape num_steps x num_stocks
Sigma_r, # covariance matrix of returns of risky assets
init_x_vals, # array of initial asset position values (num_risky_assets + 1)
use_for_WM = True): # use for wealth management tasks
self.num_steps = num_steps
self.num_assets = num_risky_assets + 1
self.lambd = torch.tensor(params[0], requires_grad=False, dtype=torch.float64)
self.Omega_mat = params[1] * torch.eye(self.num_assets,dtype=torch.float64)
self.eta = torch.tensor(params[2], requires_grad=False, dtype=torch.float64)
self.rho = torch.tensor(params[3], requires_grad=False, dtype=torch.float64)
self.beta = torch.tensor(beta, requires_grad=False, dtype=torch.float64)
self.gamma = gamma
self.use_for_WM = use_for_WM
self.num_risky_assets = num_risky_assets
self.r_f = riskfree_rate
assert exp_returns.shape[0] == self.num_steps
assert Sigma_r.shape[0] == Sigma_r.shape[1]
assert Sigma_r.shape[0] == num_risky_assets # self.num_assets
self.Sigma_r_np = Sigma_r # array of shape num_stocks x num_stocks
self.reg_mat = 1e-3*torch.eye(self.num_assets, dtype=torch.float64)
# arrays of returns for all assets including the risk-free asset
# array of shape num_steps x (num_stocks + 1)
self.exp_returns_np = np.hstack((self.r_f * np.ones(self.num_steps).reshape((-1,1)), exp_returns))
# make block-matrix Sigma_r_tilde with Sigma_r_tilde[0,0] = 0, and the equity covariance matrix inside
self.Sigma_r_tilde_np = np.zeros((self.num_assets, self.num_assets))
self.Sigma_r_tilde_np[1:,1:] = self.Sigma_r_np
# make Torch tensors
self.exp_returns = torch.tensor(self.exp_returns_np,requires_grad=False, dtype=torch.float64)
self.Sigma_r = torch.tensor(Sigma_r,requires_grad=False, dtype=torch.float64)
self.Sigma_r_tilde = torch.tensor(self.Sigma_r_tilde_np,requires_grad=False, dtype=torch.float64)
self.benchmark_portf = torch.tensor(benchmark_portf, requires_grad=False, dtype=torch.float64)
# asset holding values for all times. Initialize with initial values,
# values for the future times will be expected values
self.x_vals_np = np.zeros((self.num_steps, self.num_assets))
self.x_vals_np[0,:] = init_x_vals
# Torch tensor
self.x_vals = torch.tensor(self.x_vals_np)
# allocate memory for coefficients of R-, F- and G-functions
self.F_xx = torch.zeros(self.num_steps, self.num_assets, self.num_assets, dtype=torch.float64,
requires_grad=True)
self.F_x = torch.zeros(self.num_steps, self.num_assets, dtype=torch.float64,
requires_grad=True)
self.F_0 = torch.zeros(self.num_steps,dtype=torch.float64,requires_grad=True)
self.Q_xx = torch.zeros(self.num_steps, self.num_assets, self.num_assets,dtype=torch.float64,
requires_grad=True)
self.Q_uu = torch.zeros(self.num_steps, self.num_assets, self.num_assets,dtype=torch.float64,
requires_grad=True)
self.Q_ux = torch.zeros(self.num_steps, self.num_assets, self.num_assets,dtype=torch.float64,
requires_grad=True)
self.Q_x = torch.zeros(self.num_steps, self.num_assets,dtype=torch.float64,requires_grad=True)
self.Q_u = torch.zeros(self.num_steps, self.num_assets,dtype=torch.float64,requires_grad=True)
self.Q_0 = torch.zeros(self.num_steps,dtype=torch.float64,requires_grad=True)
self.R_xx = torch.zeros(self.num_steps, self.num_assets, self.num_assets,dtype=torch.float64,
requires_grad=True)
self.R_uu = torch.zeros(self.num_steps, self.num_assets, self.num_assets,dtype=torch.float64,
requires_grad=True)
self.R_ux = torch.zeros(self.num_steps, self.num_assets, self.num_assets,dtype=torch.float64,
requires_grad=True)
self.R_x = torch.zeros(self.num_steps, self.num_assets,dtype=torch.float64,requires_grad=True)
self.R_u = torch.zeros(self.num_steps, self.num_assets,dtype=torch.float64,requires_grad=True)
self.R_0 = torch.zeros(self.num_steps,dtype=torch.float64,requires_grad=True)
self.reset_prior_policy()
# the list of adjustable model parameters:
self.model_params = [self.lambd, self.beta, self.Omega_mat, self.eta]
# expected cash installment for all steps
self.expected_c_t = torch.zeros(self.num_steps,dtype=torch.float64)
# realized values of the target portfolio
self.realized_target_portf = np.zeros(self.num_steps,dtype=np.float64)
# expected portfolio values for all times
self.expected_portf_val = torch.zeros(self.num_steps,dtype=torch.float64)
# the first value is the sum of initial position values
self.expected_portf_val[0] = self.x_vals[0,:].sum()
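# Parameterizations used throughout this class (see compute_reward_on_traj, compute_G_fun_on_traj
# and compute_F_fun_on_traj below): the reward, G-function and F-function are quadratic forms in the
# state x_t (asset positions) and action u_t (trades),
#   R_t(x,u) = x'R_xx x + u'R_ux x + u'R_uu u + R_x.x + R_u.u + R_0
#   G_t(x,u) = x'Q_xx x + u'Q_ux x + u'Q_uu u + Q_x.x + Q_u.u + Q_0
#   F_t(x)   = x'F_xx x + F_x.x + F_0
# so G-learning reduces to backward recursions for these time-dependent coefficients.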
def reset_prior_policy(self):
# initialize time-dependent parameters of prior policy
self.u_bar_prior = torch.zeros(self.num_steps,self.num_assets,requires_grad=False,
dtype=torch.float64)
self.v_bar_prior = torch.zeros(self.num_steps, self.num_assets, self.num_assets,requires_grad=False,
dtype=torch.float64)
self.Sigma_prior = torch.zeros(self.num_steps, self.num_assets, self.num_assets,requires_grad=False,
dtype=torch.float64)
self.Sigma_prior_inv = torch.zeros(self.num_steps, self.num_assets, self.num_assets,requires_grad=False,
dtype=torch.float64)
# at each time step, make v_bar_prior and Sigma_prior proportional to the identity matrix
for t in range(self.num_steps):
self.v_bar_prior[t,:,:] = 0.1 * torch.eye(self.num_assets).clone()
self.Sigma_prior[t,:,:] = 0.1 * torch.eye(self.num_assets).clone()
self.Sigma_prior_inv[t,:,:] = 10.0 * torch.eye(self.num_assets).clone() # np.linalg.inv(self.Sigma_prior[t,:,:])
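# The prior policy is Gaussian and linear in the state,
#   pi_0(u_t | x_t) = N(u_bar_prior[t] + v_bar_prior[t] x_t, Sigma_prior[t]),
# initialized above with a zero intercept and feedback/covariance matrices proportional to the
# identity; G-learning then updates these parameters backward in time.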
def reward_fun(self, t, x_vals, u_vals, exp_rets, lambd, Sigma_hat):
"""
The reward function
"""
x_plus = x_vals + u_vals
p_hat = self.rho.clone() * self.benchmark_portf[t] + (1-self.rho.clone())*self.eta.clone()*x_vals.sum()
aux_1 = - self.lambd.clone() * p_hat**2
aux_2 = - u_vals.sum()
aux_3 = 2*self.lambd.clone() * p_hat * x_plus.dot(torch.ones(self.num_assets, dtype=torch.float64) + exp_rets)
aux_4 = - self.lambd.clone() * x_plus.dot(Sigma_hat.mv(x_plus))
aux_5 = - u_vals.dot(self.Omega_mat.clone().mv(u_vals))
return aux_1 + aux_2 + aux_3 + aux_4 + aux_5
def compute_reward_fun(self):
"""
Compute coefficients R_xx, R_ux, etc. for all steps
"""
for t in range(0, self.num_steps):
one_plus_exp_ret = torch.ones(self.num_assets,dtype=torch.float64) + self.exp_returns[t,:]
benchmark_portf = self.benchmark_portf[t]
Sigma_hat = self.Sigma_r_tilde + torch.ger(one_plus_exp_ret, one_plus_exp_ret)
one_plus_exp_ret_by_one = torch.ger(one_plus_exp_ret,torch.ones(self.num_assets,dtype=torch.float64))
one_plus_exp_ret_by_one_T = one_plus_exp_ret_by_one.t()
one_one_T_mat = torch.ones(self.num_assets,self.num_assets)
self.R_xx[t,:,:] = (-self.lambd.clone()*(self.eta.clone()**2)*(self.rho.clone()**2)*one_one_T_mat
+ 2*self.lambd.clone()*self.eta.clone()*self.rho.clone()*one_plus_exp_ret_by_one
- self.lambd.clone()*Sigma_hat)
self.R_ux[t,:,:] = (2*self.lambd.clone()*self.eta.clone()*self.rho.clone()*one_plus_exp_ret_by_one
- 2*self.lambd.clone()*Sigma_hat)
self.R_uu[t,:,:] = - self.lambd.clone() * Sigma_hat - self.Omega_mat.clone()
self.R_x[t,:] = (-2*self.lambd.clone()*self.eta.clone()*self.rho.clone()*(1-self.rho.clone())*benchmark_portf *
torch.ones(self.num_assets,dtype=torch.float64)
+ 2*self.lambd.clone()*(1-self.rho.clone())*benchmark_portf * one_plus_exp_ret)
self.R_u[t,:] = (2*self.lambd.clone()*(1-self.rho.clone())*benchmark_portf * one_plus_exp_ret
- torch.ones(self.num_assets,dtype=torch.float64))
self.R_0[t] = - self.lambd.clone()*((1-self.rho.clone())**2) * (benchmark_portf**2)
def project_cash_injections(self):
"""
Compute the expected values of future asset positions, and the expected cash injection for future steps,
as well as realized values of the target portfolio
"""
# this assumes that the policy is trained
for t in range(1, self.num_steps): # the initial value is fixed
# increment the previous x_t
delta_x_t = self.u_bar_prior[t,:] + self.v_bar_prior[t,:,:].mv(self.x_vals[t-1,:])
self.x_vals[t,:] = self.x_vals[t-1,:] + delta_x_t
# grow using the expected return
self.x_vals[t,:] = (torch.ones(self.num_assets)+ self.exp_returns[t,:])*self.x_vals[t,:]
# compute c_t
self.expected_c_t[t] = delta_x_t.sum().data # detach().numpy()
# expected portfolio value for this step
self.expected_portf_val[t] = self.x_vals[t,:].sum().data # .detach().numpy()
def set_terminal_conditions(self):
"""
set the terminal condition for the F-function
"""
# the auxiliary quantity to perform matrix calculations
one_plus_exp_ret = torch.ones(self.num_assets,dtype=torch.float64) + self.exp_returns[-1,:]
# Compute the reward function for all steps (only the last step is needed for this function, while
# values for other time steps will be used in other functions)
self.compute_reward_fun()
if self.use_for_WM:
Sigma_hat = self.Sigma_r_tilde + torch.ger(one_plus_exp_ret, one_plus_exp_ret)
Sigma_hat_inv = torch.inverse(Sigma_hat + self.reg_mat)
Sigma_tilde = Sigma_hat + (1/self.lambd)*self.Omega_mat.clone()
Sigma_tilde_inv = torch.inverse(Sigma_tilde + self.reg_mat)
Sigma_hat_sigma_tilde = Sigma_hat.mm(Sigma_tilde)
Sigma_tilde_inv_sig_hat = Sigma_tilde_inv.mm(Sigma_hat)
Sigma_tilde_sigma_hat = Sigma_tilde.mm(Sigma_hat)
Sigma_hat_Sigma_tilde_inv = Sigma_hat.mm(Sigma_tilde_inv)
Sigma_3_plus_omega = self.lambd*Sigma_tilde_inv.mm(Sigma_hat_Sigma_tilde_inv) + self.Omega_mat.clone()
one_plus_exp_ret_by_one = torch.ger(one_plus_exp_ret,torch.ones(self.num_assets,dtype=torch.float64))
one_plus_exp_ret_by_one_T = one_plus_exp_ret_by_one.t()
one_one_T_mat = torch.ones(self.num_assets,self.num_assets)
Sigma_tilde_inv_t_R_ux = Sigma_tilde_inv.t().mm(self.R_ux[-1,:,:].clone())
Sigma_tilde_inv_t_R_uu = Sigma_tilde_inv.t().mm(self.R_uu[-1,:,:].clone())
Sigma_tilde_inv_t_R_u = Sigma_tilde_inv.t().mv(self.R_u[-1,:].clone())
Sigma_tilde_inv_R_u = Sigma_tilde_inv.mv(self.R_u[-1,:].clone())
Sigma_tilde_inv_R_ux = Sigma_tilde_inv.mm(self.R_ux[-1,:,:].clone())
Sigma_tilde_inv_t_R_uu = Sigma_tilde_inv.mm(self.R_uu[-1,:,:].clone())
# though the action at the last step is deterministic, we can still fill in
# the parameters of the prior with these values
self.u_bar_prior[-1,:] = (1/(2 * self.lambd.clone()))* Sigma_tilde_inv.clone().mv(self.R_u[-1,:].clone())
self.v_bar_prior[-1,:,:] = (1/(2 * self.lambd.clone()))* Sigma_tilde_inv.clone().mm(self.R_ux[-1,:,:].clone())
# First compute the coefficients of the reward function F at the last step:
# F_xx
self.F_xx[-1,:,:] = (self.R_xx[-1,:,:].clone()
+ (1/(2*self.lambd.clone()))* self.R_ux[-1,:,:].clone().t().mm(Sigma_tilde_inv_t_R_ux)
+ (1/(4*self.lambd.clone()**2))* self.R_ux[-1,:,:].clone().t().mm(
Sigma_tilde_inv_t_R_uu.clone().mm(Sigma_tilde_inv.clone().mm(self.R_ux[-1,:,:].clone())))
)
# F_x
self.F_x[-1,:] = (self.R_x[-1,:].clone()
+ (1/(self.lambd.clone()))* self.R_ux[-1,:,:].clone().t().mv(Sigma_tilde_inv_t_R_u.clone())
+ (1/(2*self.lambd.clone()**2))* self.R_ux[-1,:,:].clone().t().mv(
Sigma_tilde_inv_t_R_uu.clone().mv(Sigma_tilde_inv_R_u.clone()))
)
# F_0
self.F_0[-1] = (self.R_0[-1].clone()
+ (1/(2*self.lambd.clone()))* self.R_u[-1,:].clone().dot(Sigma_tilde_inv_R_u.clone())
+ (1/(4*self.lambd.clone()**2))* self.R_u[-1,:].clone().dot(
Sigma_tilde_inv_t_R_uu.clone().mv(Sigma_tilde_inv_R_u.clone()))
)
# for the Q-function at the last step:
self.Q_xx[-1,:,:] = self.R_xx[-1,:,:].clone()
self.Q_ux[-1,:,:] = self.R_ux[-1,:,:].clone()
self.Q_uu[-1,:,:] = self.R_uu[-1,:,:].clone()
self.Q_u[-1,:] = self.R_u[-1,:].clone()
self.Q_x[-1,:] = self.R_x[-1,:].clone()
self.Q_0[-1] = self.R_0[-1].clone()
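# Terminal-condition logic (sketch): at the last step there is no continuation value, so the optimal
# action simply maximizes the quadratic reward, u* = (1/(2*lambda)) * Sigma_tilde^{-1} (R_ux x + R_u);
# substituting u* back into the reward yields the terminal coefficients F_xx, F_x, F_0 computed above,
# while the terminal G-function coefficients coincide with those of the reward.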
def G_learning(self, err_tol, max_iter):
"""
find the optimal policy for the time dependent policy
"""
print('Doing G-learning, it may take a few seconds...')
# set terminal conditions
self.set_terminal_conditions()
# allocate iteration numbers for all steps
self.iter_counts = np.zeros(self.num_steps)
# iterate over time steps backward
for t in range(self.num_steps-2,-1,-1):
self.step_G_learning(t, err_tol, max_iter)
def step_G_learning(self, t, err_tol, max_iter):
"""
Perform one step of backward iteration for G-learning self-consistent equations
This should start from step t = num_steps - 2 (i.e. from a step that is before the last one)
"""
# make matrix Sigma_hat_t
one_plus_exp_ret = torch.ones(self.num_assets,dtype=torch.float64) + self.exp_returns[t,:]
Sigma_hat_t = self.Sigma_r_tilde + torch.ger(one_plus_exp_ret, one_plus_exp_ret)
# matrix A_t = diag(1 + r_bar_t)
A_t = torch.diag(torch.ones(self.num_assets,dtype=torch.float64) + self.exp_returns[t,:])
# update parameters of Q_function using next-step F-function values
self.update_Q_params(t, A_t,Sigma_hat_t)
# iterate between policy evaluation and policy improvement
while self.iter_counts[t] < max_iter:
curr_u_bar_prior = self.u_bar_prior[t,:].clone()
curr_v_bar_prior = self.v_bar_prior[t,:,:].clone()
# compute parameters of F-function for this step from parameters of Q-function
self.update_F_params(t)
# Policy iteration step: update parameters of the prior policy distribution
# with given Q- and F-function parameters
self.update_policy_params(t)
# difference between the current value of u_bar_prior and the previous one
err_u_bar = torch.sum((curr_u_bar_prior - self.u_bar_prior[t,:])**2)
# divide by num_assets in err_v_bar to get both errors on a comparable scale
err_v_bar = (1/self.num_assets)*torch.sum((curr_v_bar_prior - self.v_bar_prior[t,:,:])**2)
# choose the difference from the previous iteration as the maximum of the two errors
tol = torch.max(err_u_bar, err_v_bar) # tol = 0.5*(err_u_bar + err_v_bar)
self.iter_counts[t] += 1
# Repeat the calculation of Q- and F-values
if tol <= err_tol:
break
def update_Q_params(self,t, A_t,Sigma_hat_t):
"""
update the current (time-t) parameters of Q-function from (t+1)-parameters of F-function
"""
ones = torch.ones(self.num_assets,dtype=torch.float64)
one_plus_exp_ret = torch.ones(self.num_assets,dtype=torch.float64) + self.exp_returns[t,:]
self.Q_xx[t,:,:] = (self.R_xx[t,:,:].clone()
+ self.gamma *( (A_t.clone().mm(self.F_xx[t+1,:,:].clone())).mm(A_t.clone())
+ self.Sigma_r_tilde.clone() * self.F_xx[t+1,:,:].clone() ) )
self.Q_ux[t,:,:] = (self.R_ux[t,:,:].clone()
+ 2 * self.gamma *( (A_t.clone().mm(self.F_xx[t+1,:,:].clone())).mm(A_t.clone())
+ self.Sigma_r_tilde.clone() * self.F_xx[t+1,:,:].clone() )
)
self.Q_uu[t,:,:] = (self.R_uu[t,:,:].clone()
+ self.gamma *( (A_t.clone().mm(self.F_xx[t+1,:,:].clone())).mm(A_t.clone())
+ self.Sigma_r_tilde.clone() * self.F_xx[t+1,:,:].clone() )
- self.Omega_mat.clone()
)
self.Q_x[t,:] = self.R_x[t,:].clone() + self.gamma * A_t.clone().mv(self.F_x[t+1,:].clone())
self.Q_u[t,:] = self.R_u[t,:].clone() + self.gamma * A_t.clone().mv(self.F_x[t+1,:].clone())
self.Q_0[t] = self.R_0[t].clone() + self.gamma * self.F_0[t+1].clone()
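# In update_Q_params above, the expectation of the next-step F-function is taken under the asset
# dynamics x_{t+1} = (1 + r_t) * (x_t + u_t), with r_t = r_bar_t + noise: the deterministic part
# produces the A_t F_xx A_t terms (A_t = diag(1 + r_bar_t)), while the return noise with covariance
# Sigma_r_tilde produces the elementwise Sigma_r_tilde * F_xx term.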
def update_F_params(self,t):
"""
update the current (time-t) parameters of the F-function from the time-t parameters of the G-function
This is a policy evaluation step: it uses the current estimates of the mean parameters of the policy
"""
# produce auxiliary parameters U_t, W_t, Sigma_tilde_t
U_t = (self.beta.clone() * self.Q_ux[t,:,:].clone()
+ self.Sigma_prior_inv[t,:,:].clone().mm(self.v_bar_prior[t,:,:].clone()))
W_t = (self.beta.clone() * self.Q_u[t,:].clone()
+ self.Sigma_prior_inv[t,:,:].clone().mv(self.u_bar_prior[t,:]).clone())
Sigma_p_bar = self.Sigma_prior_inv[t,:,:].clone() - 2 * self.beta.clone() * self.Q_uu[t,:,:].clone()
Sigma_p_bar_inv = torch.inverse(Sigma_p_bar + self.reg_mat)
# update parameters of F-function
self.F_xx[t,:,:] = self.Q_xx[t,:,:].clone() + (1/(2*self.beta.clone()))*(U_t.t().mm(Sigma_p_bar_inv.clone().mm(U_t))
- self.v_bar_prior[t,:,:].clone().t().mm(
self.Sigma_prior_inv[t,:,:].clone().mm(self.v_bar_prior[t,:,:].clone())))
self.F_x[t,:] = self.Q_x[t,:].clone() + (1/self.beta.clone())*(U_t.mv(Sigma_p_bar_inv.clone().mv(W_t))
- self.v_bar_prior[t,:,:].clone().mv(
self.Sigma_prior_inv[t,:,:].clone().mv(self.u_bar_prior[t,:].clone())))
self.F_0[t] = self.Q_0[t].clone() + ( (1/(2*self.beta.clone()))*(W_t.dot(Sigma_p_bar_inv.clone().mv(W_t))
- self.u_bar_prior[t,:].clone().dot(
self.Sigma_prior_inv[t,:,:].clone().mv(self.u_bar_prior[t,:].clone())))
- (1/(2*self.beta.clone())) * (torch.log(torch.det(self.Sigma_prior[t,:,:].clone()+
self.reg_mat))
- torch.log(torch.det(Sigma_p_bar_inv.clone() + self.reg_mat))) )
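# update_F_params above is the policy-evaluation step: with a Gaussian prior and a quadratic
# G-function,
#   F_t(x) = (1/beta) * log E_{u ~ pi_0(.|x)}[ exp(beta * G_t(x, u)) ]
# is a Gaussian integral with a closed form; U_t, W_t and Sigma_p_bar collect its quadratic, linear
# and covariance pieces, and the log-determinant terms come from the normalization of that integral.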
def update_policy_params(self,t):
"""
update parameters of the Gaussian policy using current coefficients of the F- and G-functions
"""
new_Sigma_prior_inv = self.Sigma_prior_inv[t,:,:].clone() - 2 * self.beta.clone() * self.Q_uu[t,:,:].clone()
Sigma_prior_new = torch.inverse(new_Sigma_prior_inv + self.reg_mat)
# update parameters using the previous value of Sigma_prior_inv
self.u_bar_prior[t,:] = Sigma_prior_new.mv(self.Sigma_prior_inv[t,:,:].clone().mv(self.u_bar_prior[t,:].clone())
+ self.beta.clone() * self.Q_u[t,:].clone())
self.v_bar_prior[t,:,:] = Sigma_prior_new.clone().mm(self.Sigma_prior_inv[t,:,:].clone().mm(self.v_bar_prior[t,:,:].clone())
+ self.beta.clone() * self.Q_ux[t,:,:].clone())
# and then assign the new inverse covariance for the prior for the next iteration
self.Sigma_prior[t,:,:] = Sigma_prior_new.clone()
self.Sigma_prior_inv[t,:,:] = new_Sigma_prior_inv.clone()
# also assign the same values for the previous time step
if t > 0:
self.Sigma_prior[t-1,:,:] = self.Sigma_prior[t,:,:].clone()
self.u_bar_prior[t-1,:] = self.u_bar_prior[t,:].clone()
self.v_bar_prior[t-1,:,:] = self.v_bar_prior[t,:,:].clone()
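# update_policy_params above is the policy-improvement step: the new policy is the prior tilted by
# the G-function, pi(u|x) ~ pi_0(u|x) * exp(beta * G_t(x,u)), which for a Gaussian prior and a
# quadratic G-function stays Gaussian with
#   Sigma_new^{-1} = Sigma_prior^{-1} - 2*beta*Q_uu,
#   u_bar_new      = Sigma_new (Sigma_prior^{-1} u_bar_prior + beta*Q_u),
#   v_bar_new      = Sigma_new (Sigma_prior^{-1} v_bar_prior + beta*Q_ux),
# exactly as computed above.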
def trajs_to_torch_tensors(self,trajs):
"""
Convert data from a list of lists into Torch tensors
"""
num_trajs = len(trajs)
self.data_xvals = torch.zeros(num_trajs,self.num_steps,self.num_assets,dtype=torch.float64)
self.data_uvals = torch.zeros(num_trajs,self.num_steps,self.num_assets,dtype=torch.float64)
for n in range(num_trajs):
for t in range(self.num_steps):
self.data_xvals[n,t,:] = torch.tensor(trajs[n][t][0],dtype=torch.float64).clone()
self.data_uvals[n,t,:] = torch.tensor(trajs[n][t][1],dtype=torch.float64).clone()
def compute_reward_on_traj(self,
t,
x_t, u_t):
"""
Given time t and corresponding values of vectors x_t, u_t, compute the total reward for this step
"""
aux_xx = x_t.dot(self.R_xx[t,:,:].clone().mv(x_t))
aux_ux = u_t.dot(self.R_ux[t,:,:].clone().mv(x_t))
aux_uu = u_t.dot(self.R_uu[t,:,:].clone().mv(u_t))
aux_x = x_t.dot(self.R_x[t,:].clone())
aux_u = u_t.dot(self.R_u[t,:].clone())
aux_0 = self.R_0[t].clone()
return aux_xx + aux_ux + aux_uu + aux_x + aux_u + aux_0
def compute_G_fun_on_traj(self,
t,
x_t, u_t):
"""
Given time t and corresponding values of vectors x_t, u_t, compute the value of the G-function for this step
"""
aux_xx = x_t.dot(self.Q_xx[t,:,:].clone().mv(x_t))
aux_ux = u_t.dot(self.Q_ux[t,:,:].clone().mv(x_t))
aux_uu = u_t.dot(self.Q_uu[t,:,:].clone().mv(u_t))
aux_x = x_t.dot(self.Q_x[t,:].clone())
aux_u = u_t.dot(self.Q_u[t,:].clone())
aux_0 = self.Q_0[t].clone()
return aux_xx + aux_ux + aux_uu + aux_x + aux_u + aux_0
def compute_F_fun_on_traj(self,
t,
x_t):
"""
Given time t and the corresponding state vector x_t, compute the value of the F-function for this step
"""
aux_xx = x_t.dot(self.F_xx[t,:,:].clone().mv(x_t))
aux_x = x_t.dot(self.F_x[t,:].clone())
aux_0 = self.F_0[t].clone()
return aux_xx + aux_x + aux_0
def MaxEntIRL(self,
trajs,
learning_rate,
err_tol, max_iter):
"""
Estimate parameters of the reward function using MaxEnt IRL.
Inputs:
trajs - a list of trajectories. Each trajectory is a list of state-action pairs, stored as a tuple.
We assume each trajectory has the same length
"""
# omega is a tunable parameter that determines the cost matrix self.Omega_mat
omega_init = 15.0
self.omega = torch.tensor(omega_init, requires_grad=True, dtype=torch.float64)
beta_init = 50 # initial value of the inverse temperature beta (included in reward_params below)
self.beta = torch.tensor(beta_init, requires_grad=True, dtype=torch.float64)
reward_params = [self.lambd, self.eta, self.rho, self.omega, self.beta]
print("Omega mat...")
self.Omega_mat = self.omega * torch.eye(self.num_assets,dtype=torch.float64)
print("g learning...")
self.reset_prior_policy()
self.G_learning(err_tol, max_iter)
print("intialize optimizer...")
optimizer = optim.Adam(reward_params, lr=learning_rate)
print("zero grad...")
optimizer.zero_grad()
num_trajs = len(trajs)
print("trajs_to_torch_tensors...")
# fill in Torch tensors for the trajectory data
self.trajs_to_torch_tensors(trajs)
print("constructing zero tensors...")
self.realized_rewards = torch.zeros(num_trajs,self.num_steps,dtype=torch.float64,requires_grad=True)
self.realized_cum_rewards = torch.zeros(num_trajs, dtype=torch.float64, requires_grad=True)
print("constructing zero tensors...")
self.realized_G_fun = torch.zeros(num_trajs,self.num_steps,dtype=torch.float64, requires_grad=True)
self.realized_F_fun = torch.zeros(num_trajs,self.num_steps,dtype=torch.float64, requires_grad=True)
print("constructing zero tensors...")
self.realized_G_fun_cum = torch.zeros(num_trajs,dtype=torch.float64, requires_grad=True)
self.realized_F_fun_cum = torch.zeros(num_trajs,dtype=torch.float64, requires_grad=True)
print("done...")
num_iter_IRL = 3
for i in range(num_iter_IRL):
print('GIRL iteration = ', i)
self.Omega_mat = self.omega * torch.eye(self.num_assets,dtype=torch.float64)
for n in range(num_trajs):
if n%100==0:
print(n)
for t in range(self.num_steps):
# compute rewards obtained at each step for each trajectory
# given the model parameters
self.realized_rewards[n,t] = self.compute_reward_on_traj(t,
self.data_xvals[n,t,:],
self.data_uvals[n,t,:])
# compute the log-likelihood by looping over trajectories
self.realized_G_fun[n,t] = self.compute_G_fun_on_traj(t,
self.data_xvals[n,t,:],
self.data_uvals[n,t,:])
self.realized_F_fun[n,t] = self.compute_F_fun_on_traj(t,
self.data_xvals[n,t,:])
self.realized_cum_rewards[n] = self.realized_rewards[n,:].sum().clone()
self.realized_G_fun_cum[n] = self.realized_G_fun[n,:].sum().clone()
self.realized_F_fun_cum[n] = self.realized_F_fun[n,:].sum().clone()
# the negative log-likelihood will not include terms ~ Sigma_p as we do not optimize over its value
loss = - self.beta.clone()*(self.realized_G_fun_cum.sum().clone() - self.realized_F_fun_cum.sum().clone())
optimizer.zero_grad()
loss.backward()
optimizer.step()
print('Iteration = ', i)
print('Loss = ', loss.detach().numpy())
print('Done optimizing reward parameters')
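The MaxEnt IRL (GIRL) loss above is the negative log-likelihood of the observed state-action trajectories under the G-learning policy: since $\pi(u_t\mid x_t)=\pi_0(u_t\mid x_t)\,e^{\beta\,(G_t(x_t,u_t)-F_t(x_t))}$, dropping the prior-policy terms that are not optimized over leaves (a sketch of what the `loss` variable computes)

$$
-\log \mathcal{L}(\theta) \;=\; -\,\beta \sum_{n=1}^{N}\sum_{t=0}^{T-1}\Big( G_t\big(x^{(n)}_t, u^{(n)}_t\big) - F_t\big(x^{(n)}_t\big)\Big) \;+\; \text{const},
$$

which the Adam optimizer then decreases with respect to the reward parameters.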
Simulate portfolio data
Simulate the market factor under a lognormal distribution with a fixed drift and vol
mu_market = 0.05
vol_market = 0.25
init_market_val = 100.0
r_rf = 0.02 # risk-free rate - the first asset will be cash
num_steps = 10 # number of steps for planning horizon
dt = 0.25 # quarterly time steps
num_risky_assets = 99 # 100
returns_market = np.zeros(num_steps)
market_vals = np.zeros(num_steps)
market_vals[0] = init_market_val # initial value
for t in range(1,num_steps):
rand_norm = np.random.standard_t(4)
# use log-returns of market as 'returns_market'
returns_market[t] = mu_market * dt + vol_market * np.sqrt(dt) * rand_norm
market_vals[t] = market_vals[t-1] * np.exp((mu_market - 0.5*vol_market**2)*dt +
vol_market*np.sqrt(dt)*rand_norm)
Simulate market betas and idiosyncratic alphas within pre-defined ranges
beta_min = 0.05
beta_max = 0.85
beta_vals = np.random.uniform(low=beta_min, high=beta_max, size=num_risky_assets)
alpha_min = - 0.05
alpha_max = 0.15
alpha_vals = np.random.uniform(low=alpha_min, high=alpha_max, size=num_risky_assets)
data = {"alpha": alpha_vals,
"beta" : beta_vals}
pd.DataFrame(data, columns=["alpha", "beta"]).head()
|   | alpha     | beta     |
|---|-----------|----------|
| 0 | 0.124569  | 0.387554 |
| 1 | 0.142908  | 0.293034 |
| 2 | -0.031644 | 0.238275 |
| 3 | 0.060698  | 0.550975 |
| 4 | 0.052559  | 0.546520 |
Simulate time-dependent expected returns
# Time-independent expected returns would be equal to alpha + beta * expected_market_return
# Make them time-dependent (and correlated with actual returns) as alpha + beta * oracle_market_returns
oracle_coeff = 0.2
mu_vec = mu_market * np.ones(num_steps)
oracle_market_returns = mu_vec * dt + oracle_coeff*(returns_market - mu_vec)
expected_risky_returns = np.zeros((num_steps, num_risky_assets))
for t in range(num_steps):
expected_risky_returns[t,:] = alpha_vals * dt + beta_vals * oracle_market_returns[t]
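In symbols, the cell above computes

$$
\bar r^{\,\text{oracle}}_{m,t} \;=\; \mu\,\Delta t \;+\; c\,\big(r_{m,t} - \mu\big),
\qquad
\bar r_{i,t} \;=\; \alpha_i\,\Delta t \;+\; \beta_i\,\bar r^{\,\text{oracle}}_{m,t},
$$

with oracle coefficient $c = 0.2$, so the expected returns are partially informed by the realized market returns.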
Initial values of all assets
val_min = 20.0
val_max = 120.0
init_risky_asset_vals = np.random.uniform(low=val_min, high=val_max, size=num_risky_assets)
Simulate realized returns and asset prices
# Generate realized returns and realized asset values by simulating from a one-factor model
# with time-dependent expected returns
risky_asset_returns = np.zeros((num_steps, num_risky_assets))
risky_asset_vals = np.zeros((num_steps, num_risky_assets))
idiosync_vol = 0.05 # vol_market
for t in range(num_steps):
rand_norm = np.random.randn(num_risky_assets)
# asset returns are simulated from a one-factor model
risky_asset_returns[t,:] = (expected_risky_returns[t,:] + beta_vals * (returns_market[t] - mu_market * dt)
+ idiosync_vol * np.sqrt(1 - beta_vals**2) * np.sqrt(dt) * rand_norm)
# asset values
if t == 0:
risky_asset_vals[t,:] = init_risky_asset_vals
else:
risky_asset_vals[t] = risky_asset_vals[t-1] * (1 + risky_asset_returns[t,:])
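The realized risky-asset returns simulated above thus follow a one-factor model around the expected returns,

$$
r_{i,t} \;=\; \bar r_{i,t} \;+\; \beta_i\,\big(r_{m,t} - \mu\,\Delta t\big) \;+\; \sigma_{\varepsilon}\sqrt{1-\beta_i^2}\,\sqrt{\Delta t}\;\varepsilon_{i,t},
\qquad \varepsilon_{i,t}\sim\mathcal{N}(0,1),
$$

with idiosyncratic volatility $\sigma_{\varepsilon} = 0.05$, and asset values compound as $S_{i,t} = S_{i,t-1}\,(1 + r_{i,t})$.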
import plotly.express as px
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y = expected_risky_returns.T[0],
name = 'expected return'))
fig.add_trace(go.Scatter(y = risky_asset_returns.T[0],
name = 'realized return'))
fig.update_layout(title='Realized returns vs expected returns',
xaxis_title='Time Steps',
yaxis_title='return')
fig.show()
Compute the empirical covariance matrix using realized returns
cov_mat_r = np.cov(risky_asset_returns.T)
print(cov_mat_r.shape)
D, v = np.linalg.eigh(cov_mat_r)
eigenvals = D[::-1] # put them in descending order
(99, 99)
# eigenvalues: the largest eigenvalue is the market factor
pd.DataFrame(eigenvals, columns=["eigenvals"]).head()
|   | eigenvals |
|---|-----------|
| 0 | 0.502850  |
| 1 | 0.007812  |
| 2 | 0.007644  |
| 3 | 0.007173  |
| 4 | 0.005576  |
cov_mat_torch = torch.tensor(cov_mat_r)
pd.DataFrame(torch.pinverse(cov_mat_torch)[0], columns=["variance and covariance of first stock"]).head()
variance and covariance of first stock | |
---|---|
0 | 18.760736 |
1 | -4.375317 |
2 | 13.224230 |
3 | -12.367550 |
4 | -3.996179 |
Add a riskless bond as one more asset
num_assets = num_risky_assets + 1
bond_val = 100.0
# add the bond to initial assets
init_asset_vals = np.hstack((np.array([bond_val]),
init_risky_asset_vals))
Make the initial portfolio
# consider here two choices: equal or equally-weighted
init_port_choice = 'equal'
init_cash = 1000.0
init_total_asset = np.sum(init_asset_vals)
x_vals_init = np.zeros(num_assets)
if init_port_choice == 'equal':
# hold equal amounts of cash in each asset
amount_per_asset = init_cash/num_assets
x_vals_init = amount_per_asset * np.ones(num_assets)
elif init_port_choice == 'equally_weighted':
amount_per_asset = init_cash/init_total_asset
x_vals_init = amount_per_asset * init_asset_vals
Make the target portfolio
# Generate a target portfolio term structure by defining it as
# the initial portfolio growing at some fixed and high rate
target_portfolio = [init_cash]
target_return = 0.15
coeff_target = 1.1
for i in range(1,num_steps):
target_portfolio.append(target_portfolio[i-1]*np.exp(dt * target_return) )
target_portfolio = coeff_target*np.array(target_portfolio)
print(target_portfolio[0], target_portfolio[-1])
1100.0 1541.5835692311712
Define model parameters
riskfree_rate = 0.02
fee_bond = 0.05
fee_stock = 0.05
all_fees = np.zeros(num_risky_assets + 1)
all_fees[0] = fee_bond
all_fees[1:] = fee_stock
Omega_mat = np.diag(all_fees)
# model parameters
lambd = 0.001
Omega_mat = 15.5 * np.diag(all_fees)
eta = 1.5
beta = 100.0
gamma = 0.95
exp_returns = expected_risky_returns
Sigma_r = cov_mat_r
# Generate the benchmark target portfolio by growing the initial portfolio value at a fixed target rate
target_return = 0.5
benchmark_portf = [ init_cash * np.exp(dt * target_return)]
rho = 0.4
for i in range(1,num_steps):
benchmark_portf.append(benchmark_portf[i-1]*np.exp(dt * target_return) )
print(benchmark_portf[0], benchmark_portf[-1])
1133.1484530668263 3490.3429574618413
Simulate portfolio data
Produce a list of trajectories, where each trajectory is a list made of state-action pairs
lambd = 0.001
omega = 1.0
beta = 1000.0
eta = 1.5
rho = 0.4
reward_params=[lambd, omega, eta, rho]
# Create a G-learner
G_learner = G_learning_portfolio_opt(num_steps,
reward_params,
beta,
benchmark_portf,
gamma,
num_risky_assets,
riskfree_rate,
expected_risky_returns, # array of shape num_steps x num_stocks
Sigma_r, # covariance matrix of returns of risky assets
x_vals_init, # array of initial values of len (num_stocks+1)
use_for_WM = True) # use for wealth management tasks
G_learner.reset_prior_policy()
error_tol=1.e-8
max_iter_RL = 200
G_learner.G_learning(error_tol, max_iter_RL)
Doing G-learning, it may take a few seconds...
num_sim = 1000
trajs = []
np.random.seed(0)
torch.manual_seed(0)
t_0 = time.time()
x_vals = [x_vals_init]
returns_all = []
for n in range(num_sim):
this_traj = []
x_t = x_vals_init[:]
returns_array = []
for t in range(0,num_steps):
mu_t = G_learner.u_bar_prior[t,:] + G_learner.v_bar_prior[t,:].mv(torch.tensor(x_t))
u_t = np.random.multivariate_normal(mu_t.detach().numpy(), G_learner.Sigma_prior[t,:].detach().numpy())
# compute new values of x_t
x_next = x_t +u_t
# grow this with random return
idiosync_vol = 0.05 # vol_market
rand_norm = np.random.randn(num_risky_assets)
# asset returns are simulated from a one-factor model
risky_asset_returns = (expected_risky_returns[t,:] + beta_vals * (returns_market[t] - mu_market * dt)
+ idiosync_vol * np.sqrt(1 - beta_vals**2) * np.sqrt(dt) * rand_norm)
returns = np.hstack((riskfree_rate*dt, risky_asset_returns))
x_next = (1+returns)*x_next
port_returns = (x_next.sum() - x_t.sum() - np.sum(u_t) - 0.015*np.abs(u_t).sum()) / x_t.sum()
this_traj.append((x_t, u_t))
# rename
x_t = x_next
returns_array.append(port_returns)
# end the loop over time steps
trajs.append(this_traj)
returns_all.append(returns_array)
print('Done simulating trajectories in %f sec'% (time.time() - t_0))
Done simulating trajectories in 24.812280 sec
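Each simulated step records a portfolio return net of the injected cash and of a 1.5% proportional transaction cost on the traded amounts:

$$
r^{\text{port}}_t \;=\; \frac{\mathbf{1}^{\top}x_{t+1} \;-\; \mathbf{1}^{\top}x_t \;-\; \mathbf{1}^{\top}u_t \;-\; 0.015\,\lVert u_t\rVert_1}{\mathbf{1}^{\top}x_t}.
$$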
Calculate performance of G-learner (Diagnostics only)
returns_all_G = returns_all
SR_G = 0
for i in range(num_sim):
SR_G += (np.mean(returns_all_G[i])-riskfree_rate*dt)/np.std(returns_all_G[i])
SR_G/=num_sim
print(SR_G)
-0.3174297174859997
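The number above is the per-trajectory Sharpe-like ratio of these per-step returns, averaged across simulations (not annualized):

$$
\overline{SR} \;=\; \frac{1}{N_{\text{sim}}}\sum_{n=1}^{N_{\text{sim}}} \frac{\operatorname{mean}\!\big(r^{\text{port},(n)}\big) - r_f\,\Delta t}{\operatorname{std}\!\big(r^{\text{port},(n)}\big)}.
$$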
r_G = np.array([0]*num_steps, dtype='float64')
for n in range(num_steps):
for i in range(num_sim):
r_G[n]+=returns_all_G[i][n]
r_G[n]/=num_sim
import plotly.express as px
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y = r_G,
name = 'expected return'))
fig.update_layout(title='Results: averaged returns of G-learner trajectories',
xaxis_title='Time Steps',
yaxis_title='Sample Mean Returns')
fig.show()
np.save('State_act_trajs.npy', trajs)
#trajs = np.load('State_act_trajs.npy')
Plot rewards and cash installments
We can plot the learned rewards and observe the required cash installments from the G-learner model.
reward_params=[lambd, omega, eta, rho]
G_learner = G_learning_portfolio_opt(num_steps,
reward_params,
beta,
benchmark_portf,
gamma,
num_risky_assets,
riskfree_rate,
expected_risky_returns,
Sigma_r,
x_vals_init,
use_for_WM=True)
err_tol = 1.e-10
max_iter = 500
t_0 = time.time()
G_learner.reset_prior_policy()
G_learner.G_learning(err_tol=err_tol, max_iter=max_iter)
num_trajs = len(trajs)
data_xvals = torch.zeros(num_trajs, num_steps, num_assets, dtype=torch.float64, requires_grad=False)
data_uvals = torch.zeros(num_trajs, num_steps, num_assets, dtype=torch.float64, requires_grad=False)
num_trajs = len(trajs)
for n in range(num_trajs):
for t in range(num_steps):
data_xvals[n,t,:] = torch.tensor(trajs[n][t][0],dtype=torch.float64)
data_uvals[n,t,:] = torch.tensor(trajs[n][t][1],dtype=torch.float64)
realized_rewards = torch.zeros(num_trajs, num_steps, dtype=torch.float64, requires_grad=False)
realized_cum_rewards = torch.zeros(num_trajs, dtype=torch.float64, requires_grad=False)
realized_G_fun = torch.zeros(num_trajs, num_steps, dtype=torch.float64, requires_grad=False)
realized_F_fun = torch.zeros(num_trajs, num_steps, dtype=torch.float64, requires_grad=False)
realized_G_fun_cum = torch.zeros(num_trajs, dtype=torch.float64, requires_grad=False)
realized_F_fun_cum = torch.zeros(num_trajs, dtype=torch.float64, requires_grad=False)
# compute the rewards and realized values of G- and F-functions from
# all trajectories
for n in range(num_trajs):
for t in range(num_steps):
realized_rewards[n,t] = G_learner.compute_reward_on_traj(t,
data_xvals[n,t,:], data_uvals[n,t,:])
realized_G_fun[n,t] = G_learner.compute_G_fun_on_traj(t,
data_xvals[n,t,:], data_uvals[n,t,:])
realized_F_fun[n,t] = G_learner.compute_F_fun_on_traj(t,
data_xvals[n,t,:])
realized_cum_rewards[n] = realized_rewards[n,:].sum()
realized_G_fun_cum[n] = realized_G_fun[n,:].sum()
realized_F_fun_cum[n] = realized_F_fun[n,:].sum()
print('Done in %f sec'% (time.time() - t_0))
Doing G-learning, it may take a few seconds...
Done in 8.024213 sec
import plotly.express as px
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y = realized_rewards.detach().numpy()[1],
name = 'realized rewards'))
fig.update_layout(title='Realized rewards',
xaxis_title='Time Steps',
yaxis_title='Realized rewards')
import plotly.express as px
import plotly.graph_objects as go
G_learner.project_cash_injections()
eta_ = G_learner.eta.detach().numpy()
realized_target_portf = eta_ * G_learner.expected_portf_val.numpy()
fig = go.Figure()
fig.add_trace(go.Scatter(y = G_learner.expected_c_t,
name = 'optimal cash installments'))
fig.add_trace(go.Scatter(y = G_learner.expected_portf_val,
name = 'expected portfolio value'))
fig.add_trace(go.Scatter(y = realized_target_portf,
name = 'realized target portfolio'))
fig.update_layout(title='Optimal cash installments and portfolio values',
xaxis_title='Time Steps',
yaxis_title='Value')
Show the histogram of cumulative rewards
import plotly.express as px
fig = px.histogram(realized_cum_rewards.detach().numpy(), nbins=20)
fig.show()