# sliding_window_video_test.py
import os
#change this to your working directory
current_dir = r'/home/abdullah/Documents/graduation_project/codes/'
os.chdir(current_dir + 'real-time-action-recognition')
print("Current working Dorectory: ", os.getcwd())
import time
import torch
import torch.nn.parallel
import torchvision
import cv2
import torchvision.transforms as Transforms
from numpy.random import randint
import operator
import numpy as np
#from matplotlib import pyplot as plt
from Modified_CNN import TSN_model
from transforms import *
import argparse
parser = argparse.ArgumentParser(description="Standard video-level testing")
parser.add_argument('dataset', type=str, choices=['ucf101', 'hmdb51', 'kinetics'])
parser.add_argument('modality', type=str, choices=['RGB', 'Flow', 'RGBDiff'])
parser.add_argument('weights', type=str)
parser.add_argument('--arch', type=str, default="BNInception")
parser.add_argument('--test_segments', type=int, default=25)
parser.add_argument('--test_crops', type=int, default=1)
parser.add_argument('--input_size', type=int, default=224)
parser.add_argument('--crop_fusion_type', type=str, default='avg',
                    choices=['avg', 'max', 'topk'])
parser.add_argument('--k', type=int, default=3)
parser.add_argument('--dropout', type=float, default=0.7)
parser.add_argument('--classInd_file', type=str, default='')
parser.add_argument('--video', type=str, default='')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--gpus', nargs='+', type=int, default=None)
args = parser.parse_args()
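
# A hypothetical invocation, kept as a comment so the script stays runnable.
# All paths and file names below are placeholders, not files from this repo:
#   python sliding_window_video_test.py ucf101 RGB ucf101_rgb_checkpoint.pth \
#       --classInd_file classInd.txt --video sample_video.avi --gpus 0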
"""
--------------------Sliding Window ------------------------------------
"""
def sliding_window_aggregation_func(score, spans=[1, 2, 4, 8, 16], overlap=0.2, norm=True, fps=1):
"""
This is the aggregation function used for ActivityNet Challenge 2016
:param score:
:param spans:
:param overlap:
:param norm:
:param fps:
:return:
"""
def softmax(scores_row_vector):
"""
the function takes the softmax of row numpy vector
input:
scores_row_vector: a row vector of of type numpy float32
resturn:
softmax_scores: row vector of of type numpy float32
"""
torch_softmax = torch.nn.Softmax() #across rows
scores_torch = torch.from_numpy(scores_row_vector).float().cuda()
scores_torch = torch_softmax(scores_torch)
return scores_torch.data.cpu().numpy()
frm_max = score.max(axis=1)
slide_score = []
def top_k_pool(scores, k):
return np.sort(scores, axis=0)[-k:, :].mean(axis=0)
for t_span in spans:
span = t_span * fps
step = int(np.ceil(span * (1-overlap)))
local_agg = [frm_max[i: i+span].max(axis=0) for i in range(0, frm_max.shape[0], step)]
k = max(15, len(local_agg)/4)
slide_score.append(top_k_pool(np.array(local_agg), k))
out_score = np.mean(slide_score, axis=0)
if norm:
return softmax(out_score)
else:
return out_score
"""
----------------------------------------------------------------------------
"""
"""
-----------------------------print_topN_action class---------------
"""
class Top_N(object):
    # If import_indecies_top_N_scores was called, printing uses the actual
    # scores received from the server; if import_scores was called, printing
    # uses the locally computed (client) scores.
    client_flag = True

    def __init__(self, classInd_textfile, N=5):
        """
        input:
            classInd_textfile: the text file that contains the action labels
            N: an integer referring to the top N actions
        """
        self.N = N
        self.actions_list = self.load_actions_label(classInd_textfile)
        self.scores = None
        self.top_N_scores = None
        self.top_N_actions = None
        self.indecies = None

    def load_actions_label(self, classInd):
        """
        Returns a list of the actions ordered by list index.
        input:
            classInd: a string naming the input text file
        output:
            action_label_list: a list of action names
        """
        action_label_list = []
        with open(classInd) as f:
            content = f.readlines()
        for line in content:
            # split the class number from the action name and strip the
            # trailing '\n' with rstrip()
            action_label_list.append(line.split(' ')[-1].rstrip())
        return action_label_list

    def import_scores(self, scores):
        """
        Imports the scores of the actions.
        """
        Top_N.client_flag = False
        self.scores = scores

    def get_top_N_actions(self):
        """
        Uses self.scores, a 1-d numpy array (row vector) of action scores.
        Returns a tuple (top N actions' indices as a numpy array,
                         list of top N action names,
                         list of top N scores).
        Returns False if no scores were given.
        """
        # handle the case where no scores were imported
        if self.scores is None:
            return False
        sorted_indices = np.argsort(self.scores)[::-1]  # sort the indices from
                                                        # highest to lowest score
        sorted_indices = sorted_indices[:self.N]        # keep the top N actions
        top_N_actions = []
        top_N_scores = []
        for i in sorted_indices:
            top_N_actions.append(self.actions_list[i])
            top_N_scores.append(self.scores[i])
        self.top_N_scores = top_N_scores
        self.top_N_actions = top_N_actions
        self.indecies = sorted_indices
        return sorted_indices, top_N_actions, top_N_scores

    def import_indecies_top_N_scores(self, tuple_input):
        """
        Takes an input tuple (indices, scores), e.g. as received from the server.
        """
        Top_N.client_flag = True
        self.indecies, self.top_N_scores = tuple_input

    def index_to_actionString(self):
        """
        Maps the stored indices to their action-name strings and returns
        the resulting list. Returns False if no indices were given.
        """
        # handle the case where no indices were imported
        if self.indecies is None:
            return False
        top_N_actions = []
        for i in self.indecies:
            top_N_actions.append(self.actions_list[i])
        self.top_N_actions = top_N_actions
        return top_N_actions

    def __str__(self):
        """
        Called when doing print(object).
        """
        open_statement = "Top " + str(self.N) + " Actions.\n"
        # handle the case where no scores were imported
        if self.scores is None and self.indecies is None:
            return open_statement + "\nNo scores were given."
        if Top_N.client_flag:  # import_indecies_top_N_scores was called
            self.index_to_actionString()
        else:                  # import_scores was called
            self.get_top_N_actions()
        action_statement = ''
        for i in range(self.N):
            action_statement += self.top_N_actions[i] + " : " \
                + "{0:.4f}".format(self.top_N_scores[i] * 100) + '\n'
        return open_statement + action_statement
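
# A minimal usage sketch (hypothetical file name): classInd.txt is assumed to
# hold one "number action" pair per line, e.g. "1 ApplyEyeMakeup":
#   top5 = Top_N('classInd.txt', N=5)
#   top5.import_scores(np.random.rand(101).astype(np.float32))
#   print(top5)  # prints the five highest-scoring actions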
# this function takes one video at a time and prints the top-5 scores
def one_video():
    # this nested function does the forward propagation and returns the scores
    def eval_video(data):
        """
        Evaluate a single video.
        data: tensor of shape (crop_number, num_segments*length, H, W)
        return: per-segment predictions of shape (num_segments, 1, num_classes)
        """
        if args.modality == 'RGB':
            length = 3
        elif args.modality == 'RGBDiff':
            length = 18
        else:
            raise ValueError("Unknown modality " + args.modality)

        with torch.no_grad():
            # reshape the data to shape (num_segments*crop_number, length, H, W)
            input_var = data.view(-1, length, data.size(1), data.size(2))
            # forward propagation
            output = model(input_var)
            # reshape to (num_crop, num_segments, num_classes)
            output_torch = output.view((num_crop, test_segments, num_class))
            # average over the crops to get shape (num_segments, 1, num_classes)
            output_torch = output_torch.mean(dim=0).view((test_segments, 1, num_class))
        return output_torch

    # this function picks test_segments (default 25) frames from the whole video
    def frames_indices(frames):
        FPSeg = len(frames) // test_segments  # frames per segment
        offset = [x * FPSeg for x in range(test_segments)]
        random_indices = list(randint(FPSeg, size=test_segments))
        frame_indices = [sum(i) for i in zip(random_indices, offset)]
        return frame_indices
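
    # For example (hypothetical numbers): with 250 captured frames and
    # test_segments=25, FPSeg=10, so one frame is drawn uniformly at random
    # from each of the 25 consecutive 10-frame segments.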
    num_crop = args.test_crops
    test_segments = args.test_segments

    """
    -------------------- Initializations ---------------------------
    """
    top5_actions = Top_N(args.classInd_file, 5)

    if args.dataset == 'ucf101':
        num_class = 101
    else:
        raise ValueError('Unknown dataset: ' + args.dataset)

    model = TSN_model(num_class, 1, args.modality,
                      base_model_name=args.arch, consensus_type='avg', dropout=args.dropout)

    # load the weights saved from your model's training
    checkpoint = torch.load(args.weights)
    print("epoch {}, best acc1@: {}".format(checkpoint['epoch'], checkpoint['best_acc1']))

    base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint['state_dict'].items())}
    model.load_state_dict(base_dict)

    # test_crops is set to 1 for fast video evaluation
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(model.scale_size),
            GroupCenterCrop(model.input_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(model.input_size, model.scale_size)
        ])
    else:
        raise ValueError("Only 1 and 10 crops are supported, but got {}".format(args.test_crops))

    # required transformations
    transform = torchvision.transforms.Compose([
        cropping,
        Stack(roll=args.arch == 'BNInception'),
        ToTorchFormatTensor(div=args.arch != 'BNInception'),
        GroupNormalize(model.input_mean, model.input_std),
    ])

    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    model = torch.nn.DataParallel(model.cuda(devices[0]), device_ids=devices)
    model.eval()

    """
    -------------- Capturing the frames from the video -------------------
    """
    frames = []
    capture = cv2.VideoCapture(args.video)
    frame_count = 0
    while True:
        ret, orig_frame = capture.read()
        if ret is True:
            frame = cv2.cvtColor(orig_frame, cv2.COLOR_BGR2RGB)
        else:
            break
        # RGB_frame is kept for plotting a frame with the top-5 scores on it
        RGB_frame = frame
        # use Image.fromarray so the PIL-based data augmentations can be applied
        frame = Image.fromarray(frame)
        frame_count += 1
        frames.append(frame)
        print(frame_count)
    # when everything is done, release the capture
    capture.release()
    #cv2.destroyAllWindows()

    # to evaluate the processing time
    start_time = time.time()
    """
    ----------------------------------------------------------------------
    """
    '''
    images = [cv2.imread(file) for file in glob.glob(args.video + "/*.jpg")]
    for frame in images:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = Image.fromarray(frame)
        frames.append(frame)
    '''
    indices = frames_indices(frames)
    num_segments = len(indices)
    frames_a = operator.itemgetter(*indices)(frames)
    frames_a = transform(frames_a).cuda()

    scores = eval_video(frames_a)
    scores = scores.data.cpu().numpy().copy()  # per-segment scores
    out_scores = sliding_window_aggregation_func(scores, spans=[1, 4, 8, 16], norm=True, fps=1)

    end_time = time.time() - start_time
    print("time taken: ", end_time)

    top5_actions.import_scores(out_scores)  # import the scores into the class
    print(top5_actions)


if __name__ == '__main__':
    one_video()