#################################
# Your name: Lior Erenreich
#################################
import numpy as np
import numpy.random
import scipy
from matplotlib import pyplot as plt
from scipy.special import expit
from sklearn.datasets import fetch_openml
import sklearn.preprocessing
"""
Please use the provided function signature for the SGD implementation.
Feel free to add functions and other code, and submit this file with the name sgd.py
"""
def helper():
    mnist = fetch_openml('mnist_784', as_frame=False)
    data = mnist['data']
    labels = mnist['target']
    neg, pos = "0", "8"
    train_idx = numpy.random.RandomState(0).permutation(np.where((labels[:60000] == neg) | (labels[:60000] == pos))[0])
    test_idx = numpy.random.RandomState(0).permutation(np.where((labels[60000:] == neg) | (labels[60000:] == pos))[0])
    train_data_unscaled = data[train_idx[:6000], :].astype(float)
    train_labels = (labels[train_idx[:6000]] == pos) * 2 - 1
    validation_data_unscaled = data[train_idx[6000:], :].astype(float)
    validation_labels = (labels[train_idx[6000:]] == pos) * 2 - 1
    test_data_unscaled = data[60000 + test_idx, :].astype(float)
    test_labels = (labels[60000 + test_idx] == pos) * 2 - 1
    # Preprocessing
    train_data = sklearn.preprocessing.scale(train_data_unscaled, axis=0, with_std=False)
    validation_data = sklearn.preprocessing.scale(validation_data_unscaled, axis=0, with_std=False)
    test_data = sklearn.preprocessing.scale(test_data_unscaled, axis=0, with_std=False)
    return train_data, train_labels, validation_data, validation_labels, test_data, test_labels
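# A note on the preprocessing above (my reading of it, not stated in the hand-out):
# sklearn.preprocessing.scale(..., with_std=False) only removes the per-column (per-pixel)
# mean, with no variance scaling, and each split is centered independently. A rough
# equivalent for the training split, as a sketch:
#     train_data = train_data_unscaled - train_data_unscaled.mean(axis=0)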
def SGD_hinge(data, labels, C, eta_0, T):
    """
    Implements SGD for the regularized hinge loss.
    :param data: a numpy array of shape (n, d), representing the input data points
    :param labels: a numpy array of shape (n,), representing the labels of the input data points
    :param C: a float, representing the regularization strength
    :param eta_0: a float, representing the initial learning rate
    :param T: an integer, representing the number of iterations to run the algorithm
    :return:
        w - a numpy array of shape (d,), representing the learned weight vector
        acc - a float, the accuracy of the learned model on the validation set
    """
    n, d = data.shape
    # Initialize the weight vector w to zero
    w = np.zeros(d)
    # Run the stochastic gradient descent algorithm for T iterations
    for t in range(1, T + 1):
        # Select a random index i in the range [0, n)
        i = np.random.randint(n)
        # Compute the learning rate for the current iteration
        eta_t = eta_0 / t
        # If the current sample violates the margin, step towards it and shrink w
        if labels[i] * np.dot(w, data[i]) < 1:
            w = (1 - eta_t) * w + eta_t * C * labels[i] * data[i]
        # Otherwise, only the regularization (shrinkage) term applies
        else:
            w = (1 - eta_t) * w
    # Return the learned weight vector and its accuracy on the (global) validation set
    return w, accuracy_score(validation_data, validation_labels, w)
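# A worked note on the hinge update above (my own derivation, not part of the hand-out):
# on a single sample (x_i, y_i), the regularized objective is
#     f_i(w) = 0.5 * ||w||^2 + C * max(0, 1 - y_i * <w, x_i>)
# and a subgradient is
#     w - C * y_i * x_i    if y_i * <w, x_i> < 1
#     w                    otherwise
# so the SGD step w <- w - eta_t * subgradient gives exactly the two cases used above:
#     w <- (1 - eta_t) * w + eta_t * C * y_i * x_i    (margin violated)
#     w <- (1 - eta_t) * w                            (otherwise)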
def SGD_log(data, labels, eta_0, T):
    """
    Performs stochastic gradient descent to minimize the (unregularized) log-loss objective.
    :param data: ndarray of shape (n, d)
    :param labels: ndarray of shape (n,), with values in {-1, +1}
    :param eta_0: initial learning rate
    :param T: number of iterations
    :return:
        w - ndarray of shape (d,), the learned weight vector
        acc - a float, the accuracy of the learned model on the validation set
    """
    w = np.zeros(data.shape[1])
    for t in range(1, T + 1):
        eta_t = eta_0 / t
        # Sample a random point
        i = np.random.randint(data.shape[0])
        y_i = labels[i]
        x_i = data[i]
        # Gradient of log(1 + exp(-y * <w, x>)) at w
        grad = -y_i * expit(-y_i * np.dot(w, x_i)) * x_i
        # Gradient-descent step with learning rate eta_t
        w = w - eta_t * grad
    return w, accuracy_score(validation_data, validation_labels, w)
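# A worked note on the gradient used in SGD_log (my own derivation, not from the hand-out):
# for the log loss l(w; x, y) = log(1 + exp(-y * <w, x>)), the gradient with respect to w is
#     d/dw l = -y * x * exp(-y * <w, x>) / (1 + exp(-y * <w, x>)) = -y * sigmoid(-y * <w, x>) * x
# which is exactly what `grad` computes via scipy.special.expit, so w <- w - eta_t * grad
# is a plain gradient-descent step on the log loss.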
#################################
# Place for additional code
#################################
def accuracy_score(data, labels, w):
    y_predict = np.sign(np.dot(data, w))
    accuracy = np.mean(y_predict == labels)
    return accuracy
def plot_accuracy_as_function_of_eta(eta_0_range, num_iter, T, C, is_hinge):
    """
    Finds the best eta_0 for SGD by averaging the validation accuracy over num_iter
    independent runs for each candidate value, and plots the accuracy as a function of eta_0.
    :param eta_0_range: iterable of candidate eta_0 values
    :param num_iter: number of runs to average per candidate
    :param T: number of SGD iterations per run
    :param C: regularization strength (used only for the hinge loss)
    :param is_hinge: True to use SGD_hinge, False to use SGD_log
    :return:
        best_eta_0: float - the learning rate with the highest average validation accuracy
    """
    accuracy_per_eta = []
    for eta_0 in eta_0_range:
        sum_accuracy = 0
        for i in range(num_iter):
            if is_hinge:
                w, accuracy = SGD_hinge(train_data, train_labels, C, eta_0, T)
            else:
                w, accuracy = SGD_log(train_data, train_labels, eta_0, T)
            sum_accuracy += accuracy
        accuracy_per_eta.append(sum_accuracy / num_iter)
    if is_hinge:
        plt.title("1(a) Accuracy of SGD for Hinge Loss as a Function of \u03B70")
    else:
        plt.title("2(a) Accuracy of SGD for Log Loss as a Function of \u03B70")
    plt.xlabel('\u03B70')
    plt.ylabel('Average Validation Accuracy')
    plt.xscale('log')
    plt.plot(eta_0_range, accuracy_per_eta, marker='o')
    plt.show()
    return eta_0_range[np.argmax(accuracy_per_eta)]
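# Example usage (a sketch; the grid below is illustrative, and the function relies on the
# module-level train_data / train_labels / validation_data / validation_labels set in __main__):
#     etas = [10 ** k for k in range(-5, 6)]
#     best_eta = plot_accuracy_as_function_of_eta(etas, num_iter=10, T=1000, C=1, is_hinge=True)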
def q_1_a(T, eta_0_range, num_iter, C):
    return plot_accuracy_as_function_of_eta(eta_0_range, num_iter, T, C, True)
def q_1_b(T, eta_0, num_iter, C_range):
    accuracy_to_Cs = []
    for C in C_range:
        sum_accuracy = 0
        for i in range(num_iter):
            w, accuracy = SGD_hinge(train_data, train_labels, C, eta_0, T)
            sum_accuracy += accuracy
        accuracy_to_Cs.append(sum_accuracy / num_iter)
    plt.title("1(b) Accuracy of SGD for Hinge Loss as a Function of C")
    plt.xlabel('C')
    plt.ylabel('Average Validation Accuracy')
    plt.semilogx(C_range, accuracy_to_Cs, 'o-')
    plt.show()
    print(f'1(b) Best C: {C_range[np.argmax(accuracy_to_Cs)]}')
    return C_range[np.argmax(accuracy_to_Cs)]
def q_1_c(eta_0, C, T):
    w, _ = SGD_hinge(train_data, train_labels, C, eta_0, T)
    plt.imshow(np.reshape(w, (28, 28)), interpolation='nearest')  # According to the guidance in the assignment.
    plt.title('1(c) Resulting w as an image for the best C and \u03B70, T = 20,000')
    plt.colorbar()
    plt.show()
    return w
def q_1_d(w):
    return accuracy_score(test_data, test_labels, w)
def q_2_a(T, num_iter, eta_0_range):
    return plot_accuracy_as_function_of_eta(eta_0_range, num_iter, T, 1, False)
def q_2_b(T, eta_0):
    # Train the classifier using the best eta_0 found in part (a)
    w, _ = SGD_log(train_data, train_labels, eta_0, T)
    # Show the resulting w as an image
    plt.imshow(np.reshape(w, (28, 28)), interpolation='nearest')  # According to the guidance in the assignment.
    plt.colorbar()
    plt.title("2(b) Learned Weights")
    plt.show()
    # Report the accuracy of the trained classifier on the test set
    test_accuracy = accuracy_score(test_data, test_labels, w)
    print("2(b) Accuracy on the test set: {:.4f}%".format(test_accuracy * 100))
def q_2_c(T, eta_0):
    """
    Trains the log-loss classifier for T iterations and plots the norm of w as a function of the iteration.
    Args:
    - T: int, the number of iterations for which to run SGD
    - eta_0: float, the initial learning rate
    Returns:
    - norm_history: list of floats, the norm of w after each iteration
    """
    # Initialize w
    d = train_data.shape[1]
    w = np.zeros(d)
    # SGD loop, recording ||w|| after every update
    norm_history = []
    for t in range(1, T + 1):
        eta_t = eta_0 / t
        i = np.random.choice(train_data.shape[0])
        x_i, y_i = train_data[i], train_labels[i]
        # Gradient of log(1 + exp(-y * <w, x>)), the same update as in SGD_log
        grad = -y_i * expit(-y_i * np.dot(w, x_i)) * x_i
        w = w - eta_t * grad
        norm_history.append(np.linalg.norm(w))
    # Plot the norm history
    plt.plot(range(1, T + 1), norm_history)
    plt.xlabel('Iteration')
    plt.ylabel('Norm of w')
    plt.title('2(c) Norm of w as a function of iteration')
    plt.show()
    return norm_history
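# Why the 2(c) curve is expected to trend upward (my interpretation, not stated in the code):
# with no regularization, scaling w up keeps reducing the log loss on correctly classified
# points, so the iterates keep drifting outward; since each step is bounded by eta_0 / t,
# the norm can only grow slowly (the step sizes sum like log T), but it need not converge.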
if __name__ == '__main__':
    train_data, train_labels, validation_data, validation_labels, test_data, test_labels = helper()
    # Shared grid of values, used both for eta_0 and for C: 10^-5, 10^-4, ..., 10^4, 10^5
    val_range = [10 ** i for i in range(-5, 6)]
    num_iter = 10  # 10 runs according to the assignment.
    T = 1000  # For 1(a) and 2(a) the value of T is 1,000 according to the assignment.
    C = 1  # For 1(a) the value of C is 1 according to the assignment.
    best_eta_0 = q_1_a(T, val_range, num_iter, C)
    best_c = q_1_b(T, best_eta_0, num_iter, val_range)
    Tc = 20000  # For 1(c) and 2(b)-(c) the value of T is 20,000 according to the assignment.
    w = q_1_c(best_eta_0, best_c, Tc)  # "Using the best C, \u03B70 you found, train the classifier, but for T = 20000".
    print(f'1(d) The accuracy of the best classifier on the test set is {100 * q_1_d(w):.4f}%.')
    best_eta_0 = q_2_a(T, num_iter, val_range)
    q_2_b(Tc, best_eta_0)
    q_2_c(Tc, best_eta_0)