-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdistributed.py
74 lines (59 loc) · 2.26 KB
/
distributed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
""" Distributed helpers. """
import os
import functools
import logging
import pickle
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
_LOCAL_PROCESS_GROUP = None
def lanuch_mp_worker(main_worker, config, args):
if args.dist_url == "env://" and args.world_size == -1:
args.world_size = int(os.environ["WORLD_SIZE"])
args.distributed = args.world_size > 1 or args.multiprocessing_distributed
ngpus_per_node = torch.cuda.device_count()
if args.multiprocessing_distributed:
# Since we have ngpus_per_node processes per node, the total world_size
# needs to be adjusted accordingly
args.world_size = ngpus_per_node * args.world_size
# Use torch.multiprocessing.spawn to launch distributed processes: the
# main_worker process function
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, config, args))
else:
# Simply call main_worker function
main_worker(args.gpu, ngpus_per_node, config, args)
def all_gather(tensors):
"""
All gathers the provided tensors from all processes across machines.
Args:
tensors (list): tensors to perform all gather across all processes in
all machines.
"""
gather_list = []
output_tensor = []
world_size = dist.get_world_size()
for tensor in tensors:
tensor_placeholder = [
torch.ones_like(tensor) for _ in range(world_size)
]
dist.all_gather(tensor_placeholder, tensor, async_op=False)
gather_list.append(tensor_placeholder)
for gathered_tensor in gather_list:
output_tensor.append(torch.cat(gathered_tensor, dim=0))
return output_tensor
def all_reduce(tensors, average=True):
"""
All reduce the provided tensors from all processes across machines.
Args:
tensors (list): tensors to perform all reduce across all processes in
all machines.
average (bool): scales the reduced tensor by the number of overall
processes across all machines.
"""
for tensor in tensors:
dist.all_reduce(tensor, async_op=False)
if average:
world_size = dist.get_world_size()
for tensor in tensors:
tensor.mul_(1.0 / world_size)
return tensors