From d0999451eb582519e4548bc5f50f6e0e6cb51d5b Mon Sep 17 00:00:00 2001 From: Vivek Panyam Date: Wed, 27 May 2020 00:15:30 -0400 Subject: [PATCH] [WIP][Torch] Seeds / Determinism --- .../backends/torchscript/torch_backend.cc | 9 +++++++++ source/neuropod/multiprocess/multiprocess.cc | 5 +++++ .../multiprocess/multiprocess_worker.cc | 7 ++++++- source/neuropod/multiprocess/ope_load_config.cc | 8 ++++++++ source/neuropod/multiprocess/ope_load_config.hh | 5 +++++ source/neuropod/options.hh | 17 +++++++++++++++++ 6 files changed, 50 insertions(+), 1 deletion(-) diff --git a/source/neuropod/backends/torchscript/torch_backend.cc b/source/neuropod/backends/torchscript/torch_backend.cc index da19d735..b21f2662 100644 --- a/source/neuropod/backends/torchscript/torch_backend.cc +++ b/source/neuropod/backends/torchscript/torch_backend.cc @@ -233,6 +233,9 @@ TorchNeuropodBackend::TorchNeuropodBackend(const std::string &neuropod_path, con void TorchNeuropodBackend::load_model_internal() { + at::globalContext().setDeterministicCuDNN(options_.torch_cudnn_deterministic); + at::globalContext().setBenchmarkCuDNN(options_.torch_cudnn_benchmark); + // Get the model from the neuropod auto graph_stream = loader_->get_istream_for_file("0/data/model.pt"); @@ -297,6 +300,12 @@ std::unique_ptr TorchNeuropodBackend::infer_internal(const Neu { torch::NoGradGuard guard; + // Seed if we need to + if (options_.seed >= 0) + { + torch::manual_seed(options_.seed); + } + // Get inference schema const auto &method = model_->get_method("forward"); const auto &schema = SCHEMA(method); diff --git a/source/neuropod/multiprocess/multiprocess.cc b/source/neuropod/multiprocess/multiprocess.cc index b5f68107..f08631fa 100644 --- a/source/neuropod/multiprocess/multiprocess.cc +++ b/source/neuropod/multiprocess/multiprocess.cc @@ -190,6 +190,11 @@ class MultiprocessNeuropodBackend : public NeuropodBackendWithDefaultAllocator(config.neuropod_path, config.default_backend_overrides); + neuropod = 
stdx::make_unique(config.neuropod_path, config.default_backend_overrides, opts); allocator = neuropod->get_tensor_allocator(); inputs.clear(); control_channel.send_message(LOAD_SUCCESS); diff --git a/source/neuropod/multiprocess/ope_load_config.cc b/source/neuropod/multiprocess/ope_load_config.cc index 0c88078e..b1882966 100644 --- a/source/neuropod/multiprocess/ope_load_config.cc +++ b/source/neuropod/multiprocess/ope_load_config.cc @@ -40,6 +40,10 @@ void ipc_serialize(std::ostream &out, const ope_load_config &data) { ipc_serialize(out, data.neuropod_path); ipc_serialize(out, data.default_backend_overrides); + + ipc_serialize(out, data.seed); + ipc_serialize(out, data.torch_cudnn_deterministic); + ipc_serialize(out, data.torch_cudnn_benchmark); } template <> void ipc_deserialize(std::istream &in, ope_load_config &data) { ipc_deserialize(in, data.neuropod_path); ipc_deserialize(in, data.default_backend_overrides); + + ipc_deserialize(in, data.seed); + ipc_deserialize(in, data.torch_cudnn_deterministic); + ipc_deserialize(in, data.torch_cudnn_benchmark); } } // namespace neuropod diff --git a/source/neuropod/multiprocess/ope_load_config.hh b/source/neuropod/multiprocess/ope_load_config.hh index 71af4edb..b4121b89 100644 --- a/source/neuropod/multiprocess/ope_load_config.hh +++ b/source/neuropod/multiprocess/ope_load_config.hh @@ -29,6 +29,11 @@ struct ope_load_config // See the docs in `neuropod.hh` std::vector default_backend_overrides; + + // Fields from options.hh that are relevant to the worker process + // (defaults match RuntimeOptions so an unset config is well-defined) + int64_t seed = -1; + bool torch_cudnn_deterministic = false; + bool torch_cudnn_benchmark = false; }; // Serialization specializations for ope_load_config diff --git a/source/neuropod/options.hh b/source/neuropod/options.hh index 0d1342a0..5c36aa48 100644 --- a/source/neuropod/options.hh +++ b/source/neuropod/options.hh @@ -72,6 +72,23 @@ struct RuntimeOptions // immediately loading the model. 
If this is set to `false`, the model will // not be loaded until the `load_model` method is called on the Neuropod. bool load_model_at_construction = true; + + // EXPERIMENTAL + // A seed to use when running a graph + // Note: this currently only applies to TorchScript models + int64_t seed = -1; + + // EXPERIMENTAL + // Whether or not to run in deterministic mode. See https://pytorch.org/docs/stable/notes/randomness.html#cudnn + // Note: this currently only applies to TorchScript models and affects all torchscript models in the process. + // Should only be used with OPE to avoid affecting other models running in the same process. + bool torch_cudnn_deterministic = false; + + // EXPERIMENTAL + // Whether or not to enable cudnn benchmark. See https://pytorch.org/docs/stable/notes/randomness.html#cudnn + // Note: this currently only applies to TorchScript models and affects all torchscript models in the process. + // Should only be used with OPE to avoid affecting other models running in the same process. + bool torch_cudnn_benchmark = false; }; } // namespace neuropod