#!/usr/bin/env Rscript
# usage: Rscript 04_array_job.R $SLURM_ARRAY_TASK_ID
# Fully self-contained R analysis script which does the following:
# - get the array task id from the command-line arguments
# - determine which chunk of the condition grid this task should compute
# - load that chunk of the condition grid into memory
# - run the analysis function for this chunk
# - save the output
# - write a log as well!
# last edited 2022-04-04 by @vankesteren
# ODISSEI Social Data Science team
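# For reference, an array job like this might be submitted with an sbatch
# script along these lines (a minimal sketch: the array range, time limit,
# and resource flags below are assumptions, not part of this script):
#   #!/bin/bash
#   #SBATCH --job-name=abm_array
#   #SBATCH --array=1-32
#   #SBATCH --cpus-per-task=16
#   #SBATCH --time=00:10:00
#   Rscript 04_array_job.R $SLURM_ARRAY_TASK_ID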
# Logging setup ----
# start with a small logging function
plog <- function(...) cat(format(Sys.time()), "|", ..., "\n")
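# e.g. plog("Loading data") prints: 2022-04-04 10:15:30 | Loading data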
# log the start of the script
start <- Sys.time()
plog("Starting the abm analysis.")
# Packages ----
plog("Loading packages & ABM code...")
# if you need custom packages installed in your personal library on Snellius, run:
# .libPaths("~/R/x86_64-pc-linux-gnu-library/4.3")
suppressPackageStartupMessages(library(tidyverse))
library(parallel)
source("src/schelling_cpp.R")
plog("Packages & ABM code loaded.")
# Load data ----
plog("Loading ABM parameter grid...")
grid_tbl <- read_rds("data_processed/grid_tbl.rds")
plog("Parameter grid loaded.")
# Chunking ----
# first we compute how many models this array task should run
plog("Computing chunks...")
# each node has 16 cores
n_cores <- 16
# each core can estimate about 400 models per minute
mod_rate <- 400
# we want to run each job for about 5 minutes
job_time <- 5
# get final chunk size
chunk_size <- n_cores * mod_rate * job_time
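# with these settings: chunk_size = 16 * 400 * 5 = 32000 models per task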
# then we get the current task id and assign the right chunk to the current job
task_id <- parse_integer(commandArgs(trailingOnly = TRUE)[1])
n_total <- nrow(grid_tbl)
chunk_start <- (task_id - 1) * chunk_size + 1
chunk_end <- min(task_id * chunk_size, n_total)
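# e.g. with chunk_size = 32000, task 1 covers rows 1..32000 and task 2
# covers rows 32001..64000; min() truncates the final task to n_total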
plog("Running ABM for grid row", chunk_start, "to", chunk_end)
# subset the parameter grid so it contains only the chunk this task needs
grid_tbl <- grid_tbl[chunk_start:chunk_end, ]
# Analysis function creation ----
# Again, it really depends on what you are doing.
# I'll output a single number for each row in the grid:
# The simulated proportion of happy nonwestern migrants
analysis_function <- function(row_idx) {
  # Get the parameters belonging to this row
  settings <- as.list(grid_tbl[row_idx, ])
  # compute the proportion of happy nonwestern migrants
  # wrap in tryCatch to avoid crashing: otherwise a single failing
  # parameter set would take down the whole chunk and waste compute!
  out <- tryCatch(
    # this is the expression to evaluate
    expr = {
      prop_vec <- c(settings$nl, settings$west, settings$nonwest)
      res <- abm_cpp(prop = prop_vec, Ba = settings$Ba)
      res$h_prop[3]
    },
    # if there is an error, return NA as output!
    error = function(e) NA_real_
  )
  return(out)
}
# Cluster creation ----
# On the supercomputer, we can make a FORK cluster.
# This way we don't have to copy data over to the
# child processes: forked workers automatically share
# all data from the main process. Forking is only
# available on UNIX-like systems.
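# Note: fork is not available on Windows. A PSOCK cluster would be the
# portable alternative, at the cost of copying data to each worker.
# A minimal sketch, assuming the workers only need grid_tbl and the
# sourced ABM code (names as used above):
# clus <- makeCluster(n_cores)
# clusterExport(clus, "grid_tbl")
# clusterEvalQ(clus, source("src/schelling_cpp.R"))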
plog("Making FORK cluster...")
clus <- makeForkCluster(n_cores)
plog("Cluster successfully created.")
# compute models with a load-balancing parallel apply;
# note the final task may run fewer than chunk_size models
plog("Running", nrow(grid_tbl), "ABM simulations...")
out <- parSapplyLB(
  cl = clus,
  X = seq_len(nrow(grid_tbl)),
  FUN = analysis_function
)
plog("Simulations done!")
# stop the cluster
stopCluster(clus)
# Storing ----
plog("Storing output...")
file_name <- paste0("results_", str_pad(task_id, 5, pad = "0"), ".rds")
write_rds(out, paste0("output/", file_name))
plog("Output stored!")
plog("Elapsed time:", format(Sys.time() - start))
# done, end of script