# preprocess.py
"""
Pre-process video data.
This script is meant to be run for a limited number of videos (for benchmarking) and it is not optimized for speed.
First, update the paths of the data in parameters.py ('data_paths'), as well as the ffmpeg and ffprobe executables,
if needed. Also set the gulp_videos variable to True if you want to use the TorchVideoGULP loader.
Then, add all the videos you want to process in a 'videos' folder inside each data_path.
Finally, run this script to pre-process the videos. After running the script, the structure of the data folder will be
as follows:
-- /path/to/data
   |-- videos
   |   |-- video1.mkv (or any other format the original video is in)
   |   |-- video1.mp4
   |   |-- video2.mp4
   |   '-- ...
   |
   |-- frames
   |   |-- video1_metadata.txt
   |   |-- video2_metadata.txt
   |   |-- ...
   |   |-- video1
   |   |   |-- 0000000.png
   |   |   |-- 0000001.png
   |   |   '-- ...
   |   |
   |   |-- video2
   |   |   |-- 0000000.png
   |   |   |-- 0000001.png
   |   |   '-- ...
   |   |
   |   '-- ...
   |
   |-- audios
   |   |-- video1.wav
   |   |-- video2.wav
   |   '-- ...
   |
   '-- gulp
       |-- data_0.gulp
       |-- data_1.gulp
       |-- ...
       |-- meta_0.gmeta
       |-- meta_1.gmeta
       '-- ...
"""
import os
import subprocess

from parameters import parameters, path_to_ffmpeg, path_to_ffprobe
# Put the directory of the configured ffmpeg executable at the front of PATH
# (presumably so that libraries which launch a bare `ffmpeg` pick up the same binary -- TODO confirm).
# When path_to_ffmpeg is just a command name there is no directory to add; prepending an empty
# entry would silently put the current working directory on PATH, so PATH is left untouched then.
_ffmpeg_dir = os.path.dirname(path_to_ffmpeg)
if _ffmpeg_dir:
    os.environ['PATH'] = f"{_ffmpeg_dir}{os.pathsep}{os.environ['PATH']}"

gulp_videos = False  # Only set to True if you are planning on using the TorchVideoGULP loader
if gulp_videos:
    # GulpIO is only needed for gulping, so it is imported (and the adapter defined) only on demand.
    import gulpio
    from gulpio.adapters import AbstractDatasetAdapter
    from gulpio.fileio import GulpIngestor

    class MyGulpAdapter(AbstractDatasetAdapter):
        """
        GulpIO adapter that feeds the videos of one data folder to a GulpIngestor.

        See examples in https://github.com/achaiah/GulpIO/blob/master/src/main/python/gulpio/adapters.py

        Args:
            data_path: Root data folder; videos are read from '<data_path>/videos'.
            video_files: Names of the video files (inside the 'videos' folder) to ingest.
            frame_size: Forwarded to gulpio.utils.resize_images
                (-1 presumably keeps the original size -- TODO confirm).
            shm_dir_path: Directory handed to gulpio.utils.temp_dir_for_bursting for temporary frames.
        """

        def __init__(self, data_path, video_files, frame_size=-1, shm_dir_path='/dev/shm'):
            self.data_path = data_path
            self.video_files = video_files
            self.frame_size = frame_size
            self.shm_dir_path = shm_dir_path

        def __len__(self):
            """Return the number of videos handled by this adapter."""
            return len(self.video_files)

        def get_bursted_frames(self, vid_file):
            """Burst the video at path vid_file into frames and return them, resized, as a list."""
            with gulpio.utils.temp_dir_for_bursting(self.shm_dir_path) as temp_burst_dir:
                frame_paths = gulpio.utils.burst_video_into_frames(vid_file, temp_burst_dir)
                # Materialise the frames while the temporary bursting directory is still open.
                frames = list(gulpio.utils.resize_images(frame_paths, self.frame_size))
            return frames

        def iter_data(self, slice_element=None):
            """
            Yield one dict with the keys 'meta', 'frames' and 'id' per selected video.

            Args:
                slice_element: Optional slice over video_files. GulpIO requests the data chunk by
                    chunk (see the adapters linked in the class docstring), so the slice has to be
                    honoured; otherwise every chunk would contain all the videos. None selects
                    every video.
            """
            indices = range(len(self))
            if slice_element is not None:
                indices = indices[slice_element]

            for i in indices:
                vid_file = self.video_files[i]
                frames = self.get_bursted_frames(f'{self.data_path}/videos/{vid_file}')
                # Strip only the extension, so that names containing dots are kept whole.
                vid_id = os.path.splitext(vid_file)[0]
                meta = [{'id': i, 'label': ''}]  # Here we would add the correct label if doing classification
                yield {'meta': meta,
                       'frames': frames,
                       'id': vid_id}
def _run_command(args, stdout=None):
    """
    Run an external command without going through a shell.

    Passing the arguments as a list keeps file names with spaces or shell metacharacters intact.
    Failures are reported but not raised, so that one broken video does not abort the whole run.

    Args:
        args: The executable followed by its arguments, one list item each.
        stdout: Optional open file that receives the standard output of the command.

    Returns:
        True if the command ran and exited with status 0, False otherwise.
    """
    command = ' '.join(args)
    try:
        returncode = subprocess.run(args, stdout=stdout, check=False).returncode
    except OSError as error:
        print(f'Warning: could not run "{command}": {error}')
        return False
    if returncode != 0:
        print(f'Warning: "{command}" exited with status {returncode}')
    return returncode == 0


def main():
    """
    Pre-process every video found in the 'videos' folder of each configured data path.

    For each video: convert it to .mp4 if needed, extract its frames and the metadata of its
    first video stream into 'frames', and extract a mono .wav file into 'audios'. If gulp_videos
    is set, the .mp4 files are additionally gulped into 'gulp'.
    """
    for data_path in parameters['data_paths']:
        videos_dir = f'{data_path}/videos'
        frames_dir = f'{data_path}/frames'
        audios_dir = f'{data_path}/audios'

        # Create directories if they don't exist
        os.makedirs(frames_dir, exist_ok=True)
        os.makedirs(audios_dir, exist_ok=True)

        # Map every video name to the file it is processed from. Grouping by name makes sure a
        # video is handled once even if both the original and its converted .mp4 are present
        # (e.g. when the script is run a second time). The original is preferred as the source
        # because the converted .mp4 is written without an audio track.
        sources = {}
        for video in sorted(os.listdir(videos_dir)):
            if not os.path.isfile(f'{videos_dir}/{video}'):
                continue  # Sub-directories are not videos
            video_name, video_ext = os.path.splitext(video)
            if video_name not in sources or video_ext != '.mp4':
                sources[video_name] = video

        # Process each video
        for video_name, video in sources.items():
            source_path = f'{videos_dir}/{video}'
            mp4_path = f'{videos_dir}/{video_name}.mp4'

            # 1 - Transform to .mp4 if necessary (an already converted video is not converted again)
            if source_path != mp4_path and not os.path.exists(mp4_path):
                _run_command([path_to_ffmpeg,
                              '-i', source_path,
                              '-an', '-vcodec', 'libx264', '-crf', '23',
                              mp4_path])

            # 2 - Extract frames
            video_frames_dir = f'{frames_dir}/{video_name}'
            os.makedirs(video_frames_dir, exist_ok=True)
            _run_command([path_to_ffmpeg,
                          '-i', mp4_path,
                          '-start_number', '0',
                          f'{video_frames_dir}/%07d.png'])  # Alternatively, you may want to store as jpg

            # Optionally, store metadata such as fps, duration, etc.
            with open(f'{frames_dir}/{video_name}_metadata.txt', 'wb') as metadata_file:
                _run_command([path_to_ffprobe,
                              '-v', 'error',  # Suppress warnings (only show errors)
                              '-select_streams', 'v:0',
                              '-show_entries', 'stream=width,height,duration,r_frame_rate',
                              '-of', 'csv=s=x:p=0',
                              mp4_path],
                             stdout=metadata_file)

            # 3 - Extract audio. It is read from the source file: a converted .mp4 has no audio
            # stream left (it is created with -an), so extracting from it would fail.
            _run_command([path_to_ffmpeg,
                          '-i', source_path,
                          '-vn',
                          '-ac', '1',
                          f'{audios_dir}/{video_name}.wav'])

        # 4 - "Gulp" the videos
        if gulp_videos:
            gulp_output_folder = f'{data_path}/gulp'
            os.makedirs(gulp_output_folder, exist_ok=True)
            # Only gulp the .mp4 files that actually exist (a conversion may have failed)
            mp4_files = [f'{video_name}.mp4' for video_name in sources
                         if os.path.isfile(f'{videos_dir}/{video_name}.mp4')]
            # See https://github.com/achaiah/GulpIO/blob/master/src/main/scripts/gulp_20bn_json_videos
            adapter = MyGulpAdapter(data_path, mp4_files)
            ingestor = GulpIngestor(adapter, gulp_output_folder, videos_per_chunk=100, num_workers=10)
            ingestor()
# Run the pre-processing only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()