# q-learning.py: tabular Q-learning on Gymnasium's CartPole-v1
import gymnasium as gym
import numpy as np
from matplotlib import pyplot as plt
def main():
    env = gym.make("CartPole-v1")
    actions = [0, 1]  # Action space: 0 pushes the cart to the left, 1 pushes it to the right
    bc_x, bc_x_dot, bc_theta, bc_theta_dot = 2, 1, 6, 12  # Number of bins per state variable (bin counts)
    # Create the bin boundaries. Only interior boundaries are needed: n bins take
    # n - 1 boundaries, so sample n + 1 evenly spaced points and drop the endpoints
    # (boundaries at the range limits would produce outer bins that are never reached).
    x_bins = np.linspace(-4.8, 4.8, bc_x + 1)[1:-1]
    x_dot_bins = np.linspace(-0.5, 0.5, bc_x_dot + 1)[1:-1]  # -0.5 to 0.5 chosen by watching the cart; it rarely goes beyond these values.
    theta_bins = np.linspace(-0.418, 0.418, bc_theta + 1)[1:-1]
    theta_dot_bins = np.linspace(-1, 1, bc_theta_dot + 1)[1:-1]  # Same story here.
    bins = (x_bins, x_dot_bins, theta_bins, theta_dot_bins)
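    # How np.digitize buckets a value (illustrative boundaries, not the ones above):
    # np.digitize(0.1, [-0.209, 0.0, 0.209]) -> 2, i.e. the third of four bins;
    # k boundaries always yield bin indices 0..k.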
    gamma = 0.99  # Discount factor
    alpha_start = 0.1  # Learning rate schedule (constant here, since start == end)
    alpha_end = 0.1
    alpha_end_episode = 200
    epsilon_start = 1  # Exploration rate, annealed linearly down to epsilon_end
    epsilon_end = 0.1
    epsilon_end_episode = 200  # Episode at which epsilon reaches epsilon_end
    max_episode_count = 1000  # Max episode count
    # The Q-table of values, indexed as Q_table[S][A]
    Q_table = {}
    # Initialize all Q(s,a) to 0
    for x in range(bc_x):
        for x_dot in range(bc_x_dot):
            for theta in range(bc_theta):
                for theta_dot in range(bc_theta_dot):
                    Q_table[(x, x_dot, theta, theta_dot)] = {a: 0 for a in actions}
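    # With these bin counts the table holds 2 * 1 * 6 * 12 = 144 states x 2 actions = 288 values.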
    previous_total_rewards = []  # History of total reward per episode
    for episode in range(max_episode_count):
        # Linearly anneal alpha and epsilon from *_start to *_end over the first
        # *_end_episode episodes, then hold them at the end value.
        alpha = max(episode * ((alpha_end - alpha_start) / alpha_end_episode) + alpha_start, alpha_end)
        epsilon = max(episode * ((epsilon_end - epsilon_start) / epsilon_end_episode) + epsilon_start, epsilon_end)
        observation, _ = env.reset()  # x, x_dot, theta, theta_dot (Initialize S)
        # Discretize the observation into a state tuple
        state = tuple(np.digitize(observation[i], bins[i]) for i in range(len(observation)))
        done = False
        truncated = False
        total_reward = 0
        while not done and not truncated:
            # Choose A from S using the policy derived from Q (epsilon-greedy)
            if np.random.random() < epsilon:
                action = np.random.choice(actions)  # Explore: uniform random action
            else:
                action = max(Q_table[state], key=Q_table[state].get)  # Exploit: greedy action
            # Take action A, observe R, S'
            next_observation, reward, done, truncated, _ = env.step(action)
            total_reward += reward
            # Discretize the next observation
            next_state = tuple(np.digitize(next_observation[i], bins[i]) for i in range(len(next_observation)))
            # Q(S,A) <- Q(S,A) + alpha * ( R + gamma * max_a Q(S',a) - Q(S,A) )
            # When the episode terminates, S' is terminal and contributes no future value,
            # so the target is just R (truncation by the time limit still bootstraps).
            target = reward if done else reward + gamma * max(Q_table[next_state].values())
            Q_table[state][action] += alpha * (target - Q_table[state][action])
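            # Bootstrapping with max over Q(S',a), rather than the value of the action
            # actually taken next, is what makes this off-policy Q-learning; using the
            # next action's value instead would give SARSA.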
            state = next_state  # S <- S'
        previous_total_rewards.append(total_reward)
        if len(previous_total_rewards) > 100:
            avg_100 = sum(previous_total_rewards[-100:]) / 100
            print(f"episode={episode} reward={total_reward} epsilon={epsilon:.3f} avg(last 100)={avg_100:.2f}")
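            # CartPole-v1's registered reward threshold is 475: the task counts as
            # solved once this 100-episode average reaches it.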
    plt.title("Q-Learning on CartPole-v1")
    plt.xlabel("Episode")
    plt.ylabel("Total reward")
    plt.plot(previous_total_rewards)
    plt.show()
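    env.close()  # Free the environment's resources once training and plotting are done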
if __name__ == "__main__":
    main()