Commit df53bb34, authored 3 years ago by Doga Keskin
FIXED BUGS
parent 6d9d44ea
Showing 1 changed file: CW.ipynb (+1 addition, −1 deletion)
...
...
@@ -420,7 +420,7 @@
"\n",
"\n",
" discounted_reward_tensor = torch.tensor([discounted_rewards],dtype=torch.float).view(64,1).to(device)\n",
"
#
print(\"DISCOUNT TENSOR\" + str(discounted_reward_tensor.shape))\n",
" print(\"DISCOUNT TENSOR\" + str(discounted_reward_tensor.shape))\n",
"\n",
" state_tensor = torch.cat([t[0] for t in PPO_BUFFER])\n",
" state_tensor = state_tensor.view(-1,21).to(device).float().unsqueeze(0)\n",
...
...
%% Cell type:code id: tags:
``` python
import torch
import read_maze as maze
from enum import Enum
import torch.optim
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```
%% Cell type:code id: tags:
``` python
maze.load_maze()
```
%% Cell type:code id: tags:
``` python
class ActionEnum(Enum):
    NOTHING = 0
    UP = 1
    RIGHT = 2
    DOWN = 3
    LEFT = 4

class envirnoment():
    def __init__(self, start, end):
        self.start = torch.clone(start)
        self.location = torch.clone(start)
        self.end = torch.clone(end)
        self.around = torch.from_numpy(maze.get_local_maze_information(*self.location))
        self.action_space = 5
        self.time_step = torch.tensor([0])

    def reset(self):
        self.location = torch.clone(self.start)
        self.around = torch.from_numpy(maze.get_local_maze_information(*self.location))
        self.time_step = torch.tensor([0])
        state = torch.cat([self.location.unsqueeze(0), self.around.view(-1,18), self.time_step.unsqueeze(0)], dim=1)
        return state, self.location, self.around

    def take_action(self, action):
        # NEXT IS WHERE THE AGENT WILL LAND AFTER PERFORMING AN ACTION
        next = torch.clone(self.location)
        reward = 0
        if action == 1:
            if self.around[1][0][0] == 1:  # and self.around[0][1][1] == 0:
                next[1] += -1
                reward -= 1
                #print('up')
        elif action == 2:
            if self.around[2][1][0] == 1:  # and self.around[1][2][1] == 0:
                next[0] += 1
                reward += 1
                #print('right')
        elif action == 3:
            if self.around[1][2][0] == 1:  # and self.around[2][1][1] == 0:
                next[1] += 1
                reward += 1
                #print('down')
        elif action == 4:
            if self.around[0][1][0] == 1:  # and self.around[1][0][1] == 0:
                next[0] += -1
                reward -= 1
                #print('left')

        # IF THE AGENT HAS NOT CHANGED LOCATION THIS ACTION
        if torch.equal(self.location, next):
            reward = -0.5

        self.location = torch.clone(next)
        self.around = torch.from_numpy(maze.get_local_maze_information(*self.location))

        # IF THE AGENT SUCCESSFULLY GETS TO THE GOAL
        done = False
        if torch.equal(self.location, self.end):
            reward = 10
            done = True

        # SUBTRACT TIME BASED PENALTY FROM REWARD
        reward -= self.time_step * 0.01
        self.time_step += 1

        state = torch.cat([self.location.unsqueeze(0), self.around.view(-1,18), self.time_step.unsqueeze(0)], dim=1)
        return state, reward, done, self.location, self.around
```
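For reference, the 21-feature state returned by `reset` and `take_action` is the concatenation of the (x, y) location, the flattened 3×3×2 local view, and the time step (2 + 18 + 1 = 21, matching the `state_size = 21` used further down). A minimal shape sketch of that composition, not part of the original notebook; the dummy values below are illustrative only:

``` python
# Hedged sketch: how the (1, 21) state tensor is assembled, mirroring the cell above.
import torch

location = torch.tensor([1, 1])        # (2,)      agent coordinates
around = torch.zeros(3, 3, 2).long()   # (3, 3, 2) local view, 2 channels per cell (dummy values)
time_step = torch.tensor([0])          # (1,)      elapsed steps

state = torch.cat([location.unsqueeze(0),   # -> (1, 2)
                   around.view(-1, 18),     # -> (1, 18)
                   time_step.unsqueeze(0)], # -> (1, 1)
                  dim=1)
print(state.shape)  # torch.Size([1, 21])
```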
%% Cell type:code id: tags:
``` python
def print_maze(x, y):
    k = maze.get_local_maze_information(x, y)
    s = ''
    for i in k[0]:
        if i[0] == 1:
            s += 'O '
        else:
            s += 'X '
    print(s)
    s = ''
    for i in k[1]:
        if i[0] == 1:
            s += 'O '
        else:
            s += 'X '
    print(s)
    s = ''
    for i in k[2]:
        if i[0] == 1:
            s += 'O '
        else:
            s += 'X '
    print(s)
```
%% Cell type:code id: tags:
``` python
def print_alt_maze(x, y):
    k = maze.get_local_maze_information(x, y)
    r1 = ''
    r2 = ''
    r3 = ''
    for i in k:
        if i[0][0] == 1:
            r1 += 'O '
        else:
            r1 += 'X '
        if i[1][0] == 1:
            r2 += 'O '
        else:
            r2 += 'X '
        if i[2][0] == 1:
            r3 += 'O '
        else:
            r3 += 'X '
    print('======')
    print(r1)
    print(r2)
    print(r3)
    print('======')

env2 = envirnoment(torch.tensor([1,1]), torch.tensor([2,1]))
```
%% Cell type:code id: tags:
``` python
l = env2.take_action(2)
print(env2.start)
print_alt_maze(*env2.location)
print(l[0])
print(l[1])
# w = torch.cat([l[0].unsqueeze(0), l[1].view(-1,18),env2.time_step.unsqueeze(0)],dim=1)
# print(w)
```
%% Output
tensor([1, 1])
======
X X X
O O O
O X X
======
tensor([[2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]])
tensor([10.])
%% Cell type:code id: tags:
``` python
class ActorCritic(nn.Module):
    def __init__(self, in_size, num_of_actions, gru_hidden_size=64):
        super(ActorCritic, self).__init__()
        self.gru_hidden_size = gru_hidden_size
        self.gru = nn.GRU(in_size, 64, 1, batch_first=False, bidirectional=False)
        self.fc1 = nn.Linear(64, 128)
        self.actor = nn.Linear(128, num_of_actions)
        self.critic = nn.Linear(128, 1)

    def forward(self, x, h):
        out, h = self.gru(x, h)
        if out.dim() == 3:
            out = out.squeeze(0)
        out = torch.relu(out)
        out = self.fc1(out)
        out = torch.relu(out)
        a_probs = self.actor(out)
        a_probs = F.softmax(a_probs, dim=-1)
        #maybe just work with logits?
        state_vals = self.critic(out)
        return a_probs, state_vals, h

    def init_hidden(self):
        return torch.rand(1, self.gru_hidden_size).cuda()
```
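A quick, hedged shape check for the recurrent actor-critic above (run on CPU, so the `.cuda()` call in `init_hidden` is bypassed; `in_size=21` and `num_of_actions=5` are the values passed in later). This assumes a PyTorch version that accepts unbatched 2-D inputs to `nn.GRU`, which is what `train_loop` relies on:

``` python
# Hedged sketch: one forward pass through ActorCritic with an unbatched (seq_len, features) input.
net = ActorCritic(21, 5)      # CPU instance just for the shape check
x = torch.rand(1, 21)         # one state: (seq_len=1, in_size=21)
h = torch.rand(1, 64)         # GRU hidden state: (num_layers=1, hidden_size=64)

a_probs, state_vals, h_next = net(x, h)
print(a_probs.shape)     # torch.Size([1, 5])  softmax over the 5 actions
print(state_vals.shape)  # torch.Size([1, 1])  critic value
print(h_next.shape)      # torch.Size([1, 64]) hidden state carried to the next step
```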
%% Cell type:code id: tags:
``` python
class ReplayBuffer():
    def __init__(self, batch_size=256, replay_capacity=1000):
        self.memory = []
        self.batch_size = batch_size
        self.replay_capacity = replay_capacity

    def __len__(self):
        return len(self.memory)

    def save(self, experience):
        # experience: [state, action, new_state, reward]
        if len(self.memory) > self.batch_size:
            self.memory.pop(0)
        self.memory.append(experience)

    def sample(self):
        if len(self.memory) < self.batch_size:
            return random.sample(self.memory, len(self.memory))
        return random.sample(self.memory, self.batch_size)
```
%% Cell type:code id: tags:
``` python
a_list = ['nothing', 'UP', 'RIGHT', 'DOWN', 'LEFT']

# env = envirnoment(torch.tensor([1,1]), torch.tensor([199, 199]) )
# state_size = 21
#
# gamma = 0.99
# agent = ActorCritic(state_size, 5).to(device)
# optim = torch.optim.Adam(agent.parameters(), lr =0.01)

def train_loop(model, env, episodes=1000, ep_length=100, target_update_timing=64, epsilon_start=0.9, epsilon_last=0.2, epsilon_step=0.0001):
    model.train()
    steps = 0
    PPO_BUFFER = []
    epsilon = epsilon_start
    for ep in range(episodes):
        print(ep)
        print("Epsilon: " + str(epsilon))
        ep_reward = 0
        if (ep == 10):
            ep_length = 600
        state, _, _ = env.reset()
        h = model.init_hidden()
        #print(h.shape)
        for i in range(ep_length):
            #print(env.location)
            steps += 1
            epsilon = max(epsilon_last, epsilon_start - 0.0003 * steps)
            rr = torch.rand(1)
            action = 0
            if rr < epsilon:
                _, values, next_h = model(state.float().to(device), h)
                action = torch.randint(0, 5, (1,))
            else:
                #print(main_net(state))
                a_porbs, value, next_h = model(state.float().to(device), h)
                action = torch.argmax(a_porbs)
                #print(action)
            next_state, reward, done, _, _ = env.take_action(action)
            ep_reward += reward
            # print(a_list[action])
            # print_alt_maze(*env.location)
            PPO_BUFFER.append([state, action, reward, next_state, done, h.detach()])
            state = next_state
            h = next_h
            if steps != 0 and steps % target_update_timing == 0:
                train_agent_PPO(model, PPO_BUFFER, gamma)
                PPO_BUFFER = []
            if i == ep_length - 1:
                print(env.location)
                print(str(ep_reward / i))
            if done:
                print(env.location)
                print(str(ep_reward / i))
                break
        print(state)
```
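Note that the exploration schedule inside `train_loop` decays epsilon linearly by a hard-coded 0.0003 per environment step (the `epsilon_step` argument is never used), so with the defaults it reaches the `epsilon_last` floor of 0.2 after roughly (0.9 − 0.2) / 0.0003 ≈ 2334 steps. A small standalone sketch of that schedule, separate from the notebook's code:

``` python
# Hedged sketch of the linear epsilon decay used in train_loop above.
epsilon_start, epsilon_last, decay = 0.9, 0.2, 0.0003  # decay is the hard-coded per-step decrement

for steps in (1, 500, 1000, 2000, 2334, 5000):
    epsilon = max(epsilon_last, epsilon_start - decay * steps)
    print(steps, round(epsilon, 4))
# from step ~2334 onward epsilon stays pinned at the 0.2 floor
```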
%% Cell type:code id: tags:
``` python
def train_agent_PPO(model, PPO_BUFFER, gamma, cycles=5):
    model_copy = ActorCritic(state_size, 5).to(device)
    model_copy.load_state_dict(model.state_dict())
    optim = torch.optim.Adam(model_copy.parameters(), lr=0.01)
    criterion = nn.HuberLoss()

    # Calculate discounted rewards
    discounted_rewards = []
    discounted_reward = 0
    for experience in reversed(PPO_BUFFER):
        if experience[4] == True:
            discounted_reward = 0
        discounted_reward = experience[2] + gamma * discounted_reward
        discounted_rewards.insert(0, discounted_reward)

    discounted_reward_tensor = torch.tensor([discounted_rewards], dtype=torch.float).view(64, 1).to(device)
    # print("DISCOUNT TENSOR" + str(discounted_reward_tensor.shape))
    print("DISCOUNT TENSOR" + str(discounted_reward_tensor.shape))

    state_tensor = torch.cat([t[0] for t in PPO_BUFFER])
    state_tensor = state_tensor.view(-1, 21).to(device).float().unsqueeze(0)
    #print("STATE TENSOR" + str(state_tensor.shape))

    action_tensor = torch.tensor([t[1] for t in PPO_BUFFER])
    action_tensor = action_tensor.view(-1, 1).to(device)
    #print("ACTION TENSOR" + str(action_tensor.shape))

    next_state = torch.cat([t[3] for t in PPO_BUFFER])
    next_state = next_state.view(-1, 21).to(device).float()
    #print("NEXT STATE TENSOR" + str(next_state.shape))

    hidden_state = torch.cat([t[5] for t in PPO_BUFFER]).unsqueeze(0).to(device)
    #print(hidden_state.shape)
    #hidden_state = hidden_state.view(2,-1,64).to(device).float()
    #print("HIDDEN TENSOR" + str(hidden_state.shape))

    #torch.autograd.set_detect_anomaly(True)
    for i in range(cycles):
        current_policy, _, _ = model(state_tensor.float().to(device), hidden_state)
        new_policy, values, _ = model_copy(state_tensor.float().to(device), hidden_state)

        advantages = discounted_reward_tensor - values
        safety = 0.0001
        ratios = (torch.gather(new_policy, 1, action_tensor) + safety) / (torch.gather(current_policy, 1, action_tensor) + safety)
        #print(values)
        actor_loss = -1 * torch.min(advantages * ratios, torch.clamp(ratios, 1 - 0.2, 1 + 0.2) * advantages)
        critic_loss = criterion(discounted_reward_tensor, values.squeeze(0))

        optim.zero_grad()
        total_loss = actor_loss + critic_loss
        total_loss = total_loss.mean()
        total_loss.backward()
        optim.step()
        #print("Example actor loss:", str(actor_loss.mean()))
        #print("Example critic loss:", str(critic_loss.mean()))

    model.load_state_dict(model_copy.state_dict())
```
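For reference, the loss assembled in this cell is the PPO clipped surrogate with clip range 0.2 plus a Huber critic loss on the discounted returns: the advantage is taken directly as return minus value (no GAE), a small `safety` constant is added to both probabilities before forming the ratio, and `model_copy` (the optimised new policy) is compared against `model` (the frozen old policy). Written out in the usual notation (mine, not from the notebook):

``` latex
G_t = r_t + \gamma\, G_{t+1}, \qquad
\hat{A}_t = G_t - V(s_t), \qquad
r_t(\theta) = \frac{\pi_{\theta_{\text{new}}}(a_t \mid s_t) + \varepsilon_{\text{safety}}}
                   {\pi_{\theta_{\text{old}}}(a_t \mid s_t) + \varepsilon_{\text{safety}}}

L_{\text{actor}} = -\min\!\big(r_t(\theta)\,\hat{A}_t,\;
                   \operatorname{clip}(r_t(\theta),\, 0.8,\, 1.2)\,\hat{A}_t\big), \qquad
L_{\text{total}} = \operatorname{mean}\big(L_{\text{actor}} + L_{\text{critic}}^{\text{Huber}}\big)
```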
%% Cell type:code id: tags:
``` python
env = envirnoment(torch.tensor([1,1]), torch.tensor([199, 199]))
state_size = 21

gamma = 0.99
agent = ActorCritic(state_size, 5).to(device)
optim = torch.optim.Adam(agent.parameters(), lr=0.01)

train_loop(agent, env)
```
%% Output
0
Epsilon: 0.9
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_24012/2690743689.py in <module>
6 optim = torch.optim.Adam(agent.parameters(), lr =0.01)
7
----> 8 train_loop(agent,env)
~\AppData\Local\Temp/ipykernel_24012/70129161.py in train_loop(model, env, episodes, ep_length, target_update_timing, epsilon_start, epsilon_last, epsilon_step)
37
38 if rr < epsilon:
---> 39 _, values,next_h = model(state.float().to(device),h)
40 action = torch.randint(0,5,(1,))
41
~\anaconda3\envs\Reinforcement2\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1109 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110 return forward_call(*input, **kwargs)
1111 # Do not call functions when jit is used
1112 full_backward_hooks, non_full_backward_hooks = [], []
~\AppData\Local\Temp/ipykernel_24012/1708081914.py in forward(self, x, h)
13 def forward(self, x, h):
14
---> 15 out, h = self.lstm(x, h)
16 if out.dim()==3:
17 out = out.squeeze(0)
~\anaconda3\envs\Reinforcement2\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1109 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110 return forward_call(*input, **kwargs)
1111 # Do not call functions when jit is used
1112 full_backward_hooks, non_full_backward_hooks = [], []
~\anaconda3\envs\Reinforcement2\lib\site-packages\torch\nn\modules\rnn.py in forward(self, input, hx)
748 else:
749 if hx[0].dim() != 2 or hx[1].dim() != 2:
--> 750 msg = ("For unbatched 2-D input, hx and cx should "
751 f"also be 2-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors")
752 raise RuntimeError(msg)
IndexError: index 1 is out of bounds for dimension 0 with size 1
...
...