Commit df53bb34 authored by Doga Keskin

FIXED BUGS

parent 6d9d44ea
%% Cell type:code id: tags:
``` python
import torch
import read_maze as maze
from enum import Enum
import torch.optim
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import random
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```
%% Cell type:code id: tags:
``` python
maze.load_maze()
```
%% Cell type:code id: tags:
``` python
class ActionEnum(Enum):
    NOTHING = 0
    UP = 1
    RIGHT = 2
    DOWN = 3
    LEFT = 4

class envirnoment():
    def __init__(self, start, end):
        self.start = torch.clone(start)
        self.location = torch.clone(start)
        self.end = torch.clone(end)
        self.around = torch.from_numpy(maze.get_local_maze_information(*self.location))
        self.action_space = 5
        self.time_step = torch.tensor([0])

    def reset(self):
        self.location = torch.clone(self.start)
        self.around = torch.from_numpy(maze.get_local_maze_information(*self.location))
        self.time_step = torch.tensor([0])
        # STATE = [x, y] LOCATION (2) + FLATTENED 3x3x2 LOCAL VIEW (18) + TIME STEP (1) = 21 VALUES
        state = torch.cat([self.location.unsqueeze(0), self.around.view(-1, 18), self.time_step.unsqueeze(0)], dim=1)
        return state, self.location, self.around

    def take_action(self, action):
        # NEXT_LOC IS WHERE THE AGENT WILL LAND AFTER PERFORMING AN ACTION
        next_loc = torch.clone(self.location)
        reward = 0
        if action == 1:    # UP
            if self.around[1][0][0] == 1:  # and self.around[0][1][1] == 0:
                next_loc[1] += -1
                reward -= 1
        elif action == 2:  # RIGHT
            if self.around[2][1][0] == 1:  # and self.around[1][2][1] == 0:
                next_loc[0] += 1
                reward += 1
        elif action == 3:  # DOWN
            if self.around[1][2][0] == 1:  # and self.around[2][1][1] == 0:
                next_loc[1] += 1
                reward += 1
        elif action == 4:  # LEFT
            if self.around[0][1][0] == 1:  # and self.around[1][0][1] == 0:
                next_loc[0] += -1
                reward -= 1
        # IF THE AGENT HAS NOT CHANGED LOCATION THIS ACTION
        if torch.equal(self.location, next_loc):
            reward = -0.5
        self.location = torch.clone(next_loc)
        self.around = torch.from_numpy(maze.get_local_maze_information(*self.location))
        # IF THE AGENT SUCCESSFULLY GETS TO THE GOAL
        done = False
        if torch.equal(self.location, self.end):
            reward = 10
            done = True
        # SUBTRACT TIME BASED PENALTY FROM REWARD
        reward -= self.time_step * 0.01
        self.time_step += 1
        state = torch.cat([self.location.unsqueeze(0), self.around.view(-1, 18), self.time_step.unsqueeze(0)], dim=1)
        return state, reward, done, self.location, self.around
```
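%% Cell type:markdown id: tags:
The state vector packs the agent's (x, y) location (2 values), the flattened 3x3x2 local view (18 values) and the current time step (1 value) into a single 1x21 tensor. As a quick sanity check (a minimal sketch, assuming `maze.load_maze()` has already been called as above; `check_env` is just a throwaway name), the shapes can be verified directly:
%% Cell type:code id: tags:
``` python
# Illustrative shape check only: a throwaway environment with the same start/goal as later cells
check_env = envirnoment(torch.tensor([1, 1]), torch.tensor([199, 199]))
check_state, check_loc, check_around = check_env.reset()
print(check_state.shape)   # expected: torch.Size([1, 21])
print(check_around.shape)  # expected: torch.Size([3, 3, 2])
```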
%% Cell type:code id: tags:
``` python
def print_maze(x, y):
    # Print the 3x3 local view around (x, y): 'O' for open cells, 'X' for walls
    k = maze.get_local_maze_information(x, y)
    for row in k:
        s = ''
        for cell in row:
            if cell[0] == 1:
                s += 'O '
            else:
                s += 'X '
        print(s)
```
%% Cell type:code id: tags:
``` python
def print_alt_maze(x, y):
    # Print the 3x3 local view around (x, y), transposed relative to print_maze
    k = maze.get_local_maze_information(x, y)
    rows = ['', '', '']
    for i in k:
        for r in range(3):
            if i[r][0] == 1:
                rows[r] += 'O '
            else:
                rows[r] += 'X '
    print('======')
    for r in rows:
        print(r)
    print('======')

env2 = envirnoment(torch.tensor([1, 1]), torch.tensor([2, 1]))
```
%% Cell type:code id: tags:
``` python
l = env2.take_action(2)
print(env2.start)
print_alt_maze(*env2.location)
print(l[0])
print(l[1])
# w = torch.cat([l[0].unsqueeze(0), l[1].view(-1,18),env2.time_step.unsqueeze(0)],dim=1)
# print(w)
```
%% Output
tensor([1, 1])
======
X X X
O O O
O X X
======
tensor([[2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]])
tensor([10.])
%% Cell type:code id: tags:
``` python
class ActorCritic(nn.Module):
    def __init__(self, in_size, num_of_actions, gru_hidden_size=64):
        super(ActorCritic, self).__init__()
        self.gru_hidden_size = gru_hidden_size
        self.gru = nn.GRU(in_size, gru_hidden_size, 1, batch_first=False, bidirectional=False)
        self.fc1 = nn.Linear(gru_hidden_size, 128)
        self.actor = nn.Linear(128, num_of_actions)
        self.critic = nn.Linear(128, 1)

    def forward(self, x, h):
        out, h = self.gru(x, h)
        if out.dim() == 3:
            out = out.squeeze(0)
        out = torch.relu(out)
        out = self.fc1(out)
        out = torch.relu(out)
        a_probs = self.actor(out)
        a_probs = F.softmax(a_probs, dim=-1)  # maybe just work with logits?
        state_vals = self.critic(out)
        return a_probs, state_vals, h

    def init_hidden(self):
        # Initial GRU hidden state for unbatched input: (num_layers, hidden_size)
        return torch.rand(1, self.gru_hidden_size).to(device)
```
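%% Cell type:markdown id: tags:
A single forward pass with a dummy 1x21 state (a minimal sketch; `check_model` and `dummy_state` are throwaway names) confirms the shapes the training loop relies on: action probabilities of size (1, 5) that sum to 1, a state value of size (1, 1), and a GRU hidden state of size (1, 64).
%% Cell type:code id: tags:
``` python
# Illustrative shape check with a random state vector
check_model = ActorCritic(21, 5).to(device)
dummy_state = torch.rand(1, 21).to(device)
probs, value, hidden = check_model(dummy_state, check_model.init_hidden())
print(probs.shape, value.shape, hidden.shape)  # expected: (1, 5), (1, 1), (1, 64)
print(probs.sum())                             # softmax output sums to ~1
```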
%% Cell type:code id: tags:
``` python
class ReplayBuffer():
    def __init__(self, batch_size=256, replay_capacity=1000):
        self.memory = []
        self.batch_size = batch_size
        self.replay_capacity = replay_capacity

    def __len__(self):
        return len(self.memory)

    def save(self, experience):  # experience: [state, action, new_state, reward]
        # Evict the oldest experience once the buffer reaches capacity
        if len(self.memory) >= self.replay_capacity:
            self.memory.pop(0)
        self.memory.append(experience)

    def sample(self):
        if len(self.memory) < self.batch_size:
            return random.sample(self.memory, len(self.memory))
        return random.sample(self.memory, self.batch_size)
```
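%% Cell type:markdown id: tags:
`ReplayBuffer` keeps a bounded FIFO of experiences and returns a uniform random batch; the PPO loop below collects its transitions in a plain list instead, so the buffer is included as a utility. A minimal usage sketch with placeholder experiences (`demo_buffer` is just an illustrative name):
%% Cell type:code id: tags:
``` python
# Illustrative usage only: the saved entries are placeholders, not real transitions
demo_buffer = ReplayBuffer(batch_size=4, replay_capacity=8)
for step in range(10):
    demo_buffer.save([torch.rand(1, 21), step % 5, torch.rand(1, 21), 0.0])
print(len(demo_buffer))           # capped at replay_capacity = 8
print(len(demo_buffer.sample()))  # batch_size = 4 sampled experiences
```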
%% Cell type:code id: tags:
``` python
a_list = ['nothing', 'UP', 'RIGHT', 'DOWN', 'LEFT']

def train_loop(model, env, episodes=1000, ep_length=100, target_update_timing=64, epsilon_start=0.9, epsilon_last=0.2, epsilon_step=0.0003):
    model.train()
    steps = 0
    PPO_BUFFER = []
    epsilon = epsilon_start
    for ep in range(episodes):
        print(ep)
        print("Epsilon: " + str(epsilon))
        ep_reward = 0
        if ep == 10:
            ep_length = 600
        state, _, _ = env.reset()
        h = model.init_hidden()
        for i in range(ep_length):
            steps += 1
            # Linearly decay epsilon down to epsilon_last
            epsilon = max(epsilon_last, epsilon_start - epsilon_step * steps)
            rr = torch.rand(1)
            if rr < epsilon:
                # Explore: random action, but still advance the recurrent hidden state
                _, values, next_h = model(state.float().to(device), h)
                action = torch.randint(0, 5, (1,))
            else:
                # Exploit: greedy action from the current policy
                a_probs, value, next_h = model(state.float().to(device), h)
                action = torch.argmax(a_probs)
            next_state, reward, done, _, _ = env.take_action(action)
            ep_reward += reward
            PPO_BUFFER.append([state, action, reward, next_state, done, h.detach()])
            state = next_state
            h = next_h
            if steps != 0 and steps % target_update_timing == 0:
                train_agent_PPO(model, PPO_BUFFER, gamma)
                PPO_BUFFER = []
            if i == ep_length - 1:
                print(env.location)
                print(str(ep_reward / (i + 1)))
            if done:
                print(env.location)
                print(str(ep_reward / (i + 1)))
                break
        print(state)
```
%% Cell type:code id: tags:
``` python
def train_agent_PPO(model, PPO_BUFFER, gamma, cycles=5):
    # Train a copy of the model with the PPO clipped objective, then copy the weights back
    model_copy = ActorCritic(state_size, 5).to(device)
    model_copy.load_state_dict(model.state_dict())
    optim = torch.optim.Adam(model_copy.parameters(), lr=0.01)
    criterion = nn.HuberLoss()
    # Calculate discounted returns, resetting at episode boundaries (done flags)
    discounted_rewards = []
    discounted_reward = 0
    for experience in reversed(PPO_BUFFER):
        if experience[4]:
            discounted_reward = 0
        discounted_reward = experience[2] + gamma * discounted_reward
        discounted_rewards.insert(0, discounted_reward)
    discounted_reward_tensor = torch.tensor([discounted_rewards], dtype=torch.float).view(-1, 1).to(device)
    state_tensor = torch.cat([t[0] for t in PPO_BUFFER])
    state_tensor = state_tensor.view(-1, 21).to(device).float().unsqueeze(0)
    action_tensor = torch.tensor([t[1] for t in PPO_BUFFER])
    action_tensor = action_tensor.view(-1, 1).to(device)
    next_state = torch.cat([t[3] for t in PPO_BUFFER])
    next_state = next_state.view(-1, 21).to(device).float()
    hidden_state = torch.cat([t[5] for t in PPO_BUFFER]).unsqueeze(0).to(device)
    for i in range(cycles):
        # Old-policy probabilities; no gradient is needed through the behaviour model
        with torch.no_grad():
            current_policy, _, _ = model(state_tensor.float().to(device), hidden_state)
        new_policy, values, _ = model_copy(state_tensor.float().to(device), hidden_state)
        advantages = (discounted_reward_tensor - values).detach()
        safety = 0.0001
        ratios = (torch.gather(new_policy, 1, action_tensor) + safety) / (torch.gather(current_policy, 1, action_tensor) + safety)
        # Clipped surrogate objective for the actor, Huber loss for the critic
        actor_loss = -1 * torch.min(advantages * ratios, torch.clamp(ratios, 1 - 0.2, 1 + 0.2) * advantages)
        critic_loss = criterion(discounted_reward_tensor, values.squeeze(0))
        optim.zero_grad()
        total_loss = actor_loss + critic_loss
        total_loss = total_loss.mean()
        total_loss.backward()
        optim.step()
    model.load_state_dict(model_copy.state_dict())
```
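%% Cell type:markdown id: tags:
For reference, the update above follows the standard PPO recipe: returns are accumulated backwards as $G_t = r_t + \gamma G_{t+1}$ (reset whenever `done` is true), advantages are estimated as $\hat{A}_t = G_t - V(s_t)$, and the actor maximises the clipped surrogate objective

$$L^{CLIP}(\theta) = \mathbb{E}_t\left[\min\left(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}\!\left(r_t(\theta),\, 1-\epsilon,\, 1+\epsilon\right)\hat{A}_t\right)\right], \qquad r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_\text{old}}(a_t \mid s_t)},$$

with $\epsilon = 0.2$ here (the small `safety` constant only guards against division by zero), while the critic minimises a Huber loss between $V(s_t)$ and $G_t$.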
%% Cell type:code id: tags:
``` python
env = envirnoment(torch.tensor([1, 1]), torch.tensor([199, 199]))
state_size = 21
gamma = 0.99
agent = ActorCritic(state_size, 5).to(device)
optim = torch.optim.Adam(agent.parameters(), lr=0.01)
train_loop(agent, env)
```
%% Output
0
Epsilon: 0.9
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_24012/2690743689.py in <module>
6 optim = torch.optim.Adam(agent.parameters(), lr =0.01)
7
----> 8 train_loop(agent,env)
~\AppData\Local\Temp/ipykernel_24012/70129161.py in train_loop(model, env, episodes, ep_length, target_update_timing, epsilon_start, epsilon_last, epsilon_step)
37
38 if rr < epsilon:
---> 39 _, values,next_h = model(state.float().to(device),h)
40 action = torch.randint(0,5,(1,))
41
~\anaconda3\envs\Reinforcement2\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1109 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110 return forward_call(*input, **kwargs)
1111 # Do not call functions when jit is used
1112 full_backward_hooks, non_full_backward_hooks = [], []
~\AppData\Local\Temp/ipykernel_24012/1708081914.py in forward(self, x, h)
13 def forward(self, x, h):
14
---> 15 out, h = self.lstm(x, h)
16 if out.dim()==3:
17 out = out.squeeze(0)
~\anaconda3\envs\Reinforcement2\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1109 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110 return forward_call(*input, **kwargs)
1111 # Do not call functions when jit is used
1112 full_backward_hooks, non_full_backward_hooks = [], []
~\anaconda3\envs\Reinforcement2\lib\site-packages\torch\nn\modules\rnn.py in forward(self, input, hx)
748 else:
749 if hx[0].dim() != 2 or hx[1].dim() != 2:
--> 750 msg = ("For unbatched 2-D input, hx and cx should "
751 f"also be 2-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors")
752 raise RuntimeError(msg)
IndexError: index 1 is out of bounds for dimension 0 with size 1