From 5d70717f25df3c1247b72155c488266a218f4444 Mon Sep 17 00:00:00 2001
From: "Tang, Jiajun" <jiajun.tang@intel.com>
Date: Thu, 26 Oct 2023 11:20:21 +0800
Subject: [PATCH] [SYCLomatic #1361] Add 24 math API tests.

Signed-off-by: Tang, Jiajun <jiajun.tang@intel.com>
---
 features/feature_case/math/math-emu-double.cu | 278 +++++++++++++++++
 features/feature_case/math/math-emu-float.cu  | 280 ++++++++++++++++++
 features/feature_case/math/math-ext-double.cu | 278 +++++++++++++++++
 features/feature_case/math/math-ext-float.cu  | 278 +++++++++++++++++
 4 files changed, 1114 insertions(+)

diff --git a/features/feature_case/math/math-emu-double.cu b/features/feature_case/math/math-emu-double.cu
index a36e971e6..fa3c81987 100644
--- a/features/feature_case/math/math-emu-double.cu
+++ b/features/feature_case/math/math-emu-double.cu
@@ -256,6 +256,200 @@ void testRnorm4dCases(const vector<pair<d_tuple4, di_pair>> &TestCases) {
   }
 }
 
+// Double Precision Intrinsics
+
+__global__ void dadd_rd(float *const Result, float Input1, float Input2) {
+  *Result = __dadd_rd(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDadd_rdCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dadd_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dadd_rd", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dadd_rn(float *const Result, float Input1, float Input2) {
+  *Result = __dadd_rn(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDadd_rnCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dadd_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dadd_rn", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dadd_ru(float *const Result, float Input1, float Input2) {
+  *Result = __dadd_ru(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDadd_ruCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dadd_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dadd_ru", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dadd_rz(float *const Result, float Input1, float Input2) {
+  *Result = __dadd_rz(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDadd_rzCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dadd_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dadd_rz", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dmul_rd(float *const Result, float Input1, float Input2) {
+  *Result = __dmul_rd(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDmul_rdCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dmul_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dmul_rd", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dmul_rn(float *const Result, float Input1, float Input2) {
+  *Result = __dmul_rn(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDmul_rnCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dmul_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dmul_rn", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dmul_ru(float *const Result, float Input1, float Input2) {
+  *Result = __dmul_ru(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDmul_ruCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dmul_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dmul_ru", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dmul_rz(float *const Result, float Input1, float Input2) {
+  *Result = __dmul_rz(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDmul_rzCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dmul_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dmul_rz", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dsub_rd(float *const Result, float Input1, float Input2) {
+  *Result = __dsub_rd(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDsub_rdCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dsub_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dsub_rd", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dsub_rn(float *const Result, float Input1, float Input2) {
+  *Result = __dsub_rn(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDsub_rnCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dsub_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dsub_rn", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dsub_ru(float *const Result, float Input1, float Input2) {
+  *Result = __dsub_ru(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDsub_ruCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dsub_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dsub_ru", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dsub_rz(float *const Result, float Input1, float Input2) {
+  *Result = __dsub_rz(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDsub_rzCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dsub_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dsub_rz", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
 int main() {
   testNormCases({
       {{-0.3, -0.34, -0.98}, {1.079814798935447, 15}},
@@ -306,6 +500,90 @@ int main() {
       {{0.5, 456, 23, 1}, {0.002190191670280358, 18}},
       {{23, 432, 23, 1}, {0.002308274913317669, 18}},
   });
+  testDadd_rdCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testDadd_rnCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testDadd_ruCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testDadd_rzCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testDmul_rdCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testDmul_rnCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testDmul_ruCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testDmul_rzCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testDsub_rdCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testDsub_rnCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testDsub_ruCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testDsub_rzCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
   cout << "passed " << passed << "/" << passed + failed << " cases!" << endl;
   if (failed) {
     cout << "failed!" << endl;
diff --git a/features/feature_case/math/math-emu-float.cu b/features/feature_case/math/math-emu-float.cu
index 77b5938e1..ea08507c1 100644
--- a/features/feature_case/math/math-emu-float.cu
+++ b/features/feature_case/math/math-emu-float.cu
@@ -46,6 +46,8 @@ void checkResult(const string &FuncName, const vector<T> &Inputs,
   check(abs(DeviceResult - Expect) < pow(10, -precision));
 }
 
+// Single Precision Mathematical Functions
+
 __global__ void expf(float *const Result, float Input1) {
   *Result = expf(Input1);
 }
@@ -271,6 +273,8 @@ void testRnormfCases(const vector<pair<f_vector, fi_pair>> &TestCases) {
   }
 }
 
+// Single Precision Intrinsics
+
 __global__ void _expf(float *const Result, float Input1) {
   *Result = __expf(Input1);
 }
@@ -286,6 +290,198 @@ void test_ExpfCases(const vector<pair<float, fi_pair>> &TestCases) {
   }
 }
 
+__global__ void fadd_rd(float *const Result, float Input1, float Input2) {
+  *Result = __fadd_rd(Input1, Input2); // add in round-down mode
+}
+
+void testFadd_rdCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fadd_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fadd_rd", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fadd_rn(float *const Result, float Input1, float Input2) {
+  *Result = __fadd_rn(Input1, Input2); // add in round-to-nearest-even mode
+}
+
+void testFadd_rnCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fadd_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fadd_rn", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fadd_ru(float *const Result, float Input1, float Input2) {
+  *Result = __fadd_ru(Input1, Input2); // add in round-up mode
+}
+
+void testFadd_ruCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fadd_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fadd_ru", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fadd_rz(float *const Result, float Input1, float Input2) {
+  *Result = __fadd_rz(Input1, Input2); // add in round-towards-zero mode
+}
+
+void testFadd_rzCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fadd_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fadd_rz", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fmul_rd(float *const Result, float Input1, float Input2) {
+  *Result = __fmul_rd(Input1, Input2); // multiply in round-down mode
+}
+
+void testFmul_rdCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fmul_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fmul_rd", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fmul_rn(float *const Result, float Input1, float Input2) {
+  *Result = __fmul_rn(Input1, Input2); // multiply in round-to-nearest-even mode
+}
+
+void testFmul_rnCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fmul_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fmul_rn", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fmul_ru(float *const Result, float Input1, float Input2) {
+  *Result = __fmul_ru(Input1, Input2); // multiply in round-up mode
+}
+
+void testFmul_ruCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fmul_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fmul_ru", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fmul_rz(float *const Result, float Input1, float Input2) {
+  *Result = __fmul_rz(Input1, Input2); // multiply in round-towards-zero mode
+}
+
+void testFmul_rzCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fmul_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fmul_rz", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fsub_rd(float *const Result, float Input1, float Input2) {
+  *Result = __fsub_rd(Input1, Input2); // subtract in round-down mode
+}
+
+void testFsub_rdCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fsub_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fsub_rd", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fsub_rn(float *const Result, float Input1, float Input2) {
+  *Result = __fsub_rn(Input1, Input2); // subtract in round-to-nearest-even mode
+}
+
+void testFsub_rnCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fsub_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fsub_rn", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fsub_ru(float *const Result, float Input1, float Input2) {
+  *Result = __fsub_ru(Input1, Input2); // subtract in round-up mode
+}
+
+void testFsub_ruCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fsub_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fsub_ru", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void fsub_rz(float *const Result, float Input1, float Input2) {
+  *Result = __fsub_rz(Input1, Input2); // subtract in round-towards-zero mode
+}
+
+void testFsub_rzCases(
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) {
+  float *Result; // managed output slot, read on the host after each launch
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    fsub_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__fsub_rz", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
 int main() {
   testExpfCases({
       {-0.3, {0.7408, 4}},
@@ -348,6 +544,90 @@ int main() {
       {23, {9745000000, -6}},
       {-12, {0.000006144, 9}},
   });
+  testFadd_rdCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7, 7}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testFadd_rnCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testFadd_ruCases({
+      {{-0.3, -0.4}, {-0.7, 7}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testFadd_rzCases({
+      {{-0.3, -0.4}, {-0.7, 7}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7, 7}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testFmul_rdCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000001, 8}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testFmul_rnCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testFmul_ruCases({
+      {{-0.3, -0.4}, {0.12000001, 8}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000001, 8}},
+      {{0.3, 0.8}, {0.24, 7}},
+      {{3, 4}, {12, 15}},
+  });
+  testFmul_rzCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testFsub_rdCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7, 7}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testFsub_rnCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testFsub_ruCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testFsub_rzCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7, 7}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
   cout << "passed " << passed << "/" << passed + failed << " cases!" << endl;
   if (failed) {
     cout << "failed!" << endl;
diff --git a/features/feature_case/math/math-ext-double.cu b/features/feature_case/math/math-ext-double.cu
index f9ca2cbb7..2ae500f47 100644
--- a/features/feature_case/math/math-ext-double.cu
+++ b/features/feature_case/math/math-ext-double.cu
@@ -310,6 +310,200 @@ void testY1Cases(const vector<pair<double, di_pair>> &TestCases) {
   }
 }
 
+// Double Precision Intrinsics
+
+__global__ void dadd_rd(float *const Result, float Input1, float Input2) {
+  *Result = __dadd_rd(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDadd_rdCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dadd_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dadd_rd", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dadd_rn(float *const Result, float Input1, float Input2) {
+  *Result = __dadd_rn(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDadd_rnCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dadd_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dadd_rn", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dadd_ru(float *const Result, float Input1, float Input2) {
+  *Result = __dadd_ru(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDadd_ruCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dadd_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dadd_ru", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dadd_rz(float *const Result, float Input1, float Input2) {
+  *Result = __dadd_rz(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDadd_rzCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dadd_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dadd_rz", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dmul_rd(float *const Result, float Input1, float Input2) {
+  *Result = __dmul_rd(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDmul_rdCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dmul_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dmul_rd", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dmul_rn(float *const Result, float Input1, float Input2) {
+  *Result = __dmul_rn(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDmul_rnCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dmul_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dmul_rn", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dmul_ru(float *const Result, float Input1, float Input2) {
+  *Result = __dmul_ru(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDmul_ruCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dmul_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dmul_ru", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dmul_rz(float *const Result, float Input1, float Input2) {
+  *Result = __dmul_rz(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDmul_rzCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dmul_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dmul_rz", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dsub_rd(float *const Result, float Input1, float Input2) {
+  *Result = __dsub_rd(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDsub_rdCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dsub_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dsub_rd", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dsub_rn(float *const Result, float Input1, float Input2) {
+  *Result = __dsub_rn(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDsub_rnCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dsub_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dsub_rn", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dsub_ru(float *const Result, float Input1, float Input2) {
+  *Result = __dsub_ru(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDsub_ruCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dsub_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dsub_ru", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
+__global__ void dsub_rz(float *const Result, float Input1, float Input2) {
+  *Result = __dsub_rz(Input1, Input2); // NOTE(review): narrows double to float
+}
+
+void testDsub_rzCases(
+    const vector<pair<pair<float, float>, di_pair>> &TestCases) {
+  float *Result; // NOTE(review): float slot for a double result - confirm
+  cudaMallocManaged(&Result, sizeof(*Result)); // status ignored; never freed
+  for (const auto &TestCase : TestCases) { // one <<<1, 1>>> launch per case
+    dsub_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize(); // wait for the kernel before reading *Result
+    checkResult("__dsub_rz", {TestCase.first.first, TestCase.first.second},
+                TestCase.second.first, *Result, TestCase.second.second);
+  }
+}
+
 int main() {
   testCylBesselI0Cases({
       {0.3, {1.022626879351597, 15}},
@@ -389,6 +583,90 @@ int main() {
       {1.6, {-0.3475780082651325, 16}},
       {5, {0.1478631433912269, 16}},
   });
+  testDadd_rdCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testDadd_rnCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testDadd_ruCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testDadd_rzCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testDmul_rdCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testDmul_rnCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testDmul_ruCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testDmul_rzCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testDsub_rdCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testDsub_rnCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testDsub_ruCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testDsub_rzCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
   cout << "passed " << passed << "/" << passed + failed << " cases!" << endl;
   if (failed) {
     cout << "failed!" << endl;
diff --git a/features/feature_case/math/math-ext-float.cu b/features/feature_case/math/math-ext-float.cu
index 8df738123..2699cfba1 100644
--- a/features/feature_case/math/math-ext-float.cu
+++ b/features/feature_case/math/math-ext-float.cu
@@ -310,6 +310,200 @@ void testY1fCases(const vector<pair<float, fi_pair>> &TestCases) {
   }
 }
 
+// Single Precision Intrinsics
+
+__global__ void fadd_rd(float *const Result, float Input1, float Input2) {
+  *Result = __fadd_rd(Input1, Input2); // Input1 + Input2 via the CUDA intrinsic, round-down mode (toward -infinity).
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFadd_rdCases( // Runs the fadd_rd kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fadd_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fadd_rd", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fadd_rn(float *const Result, float Input1, float Input2) {
+  *Result = __fadd_rn(Input1, Input2); // Input1 + Input2 via the CUDA intrinsic, round-to-nearest-even mode.
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFadd_rnCases( // Runs the fadd_rn kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fadd_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fadd_rn", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fadd_ru(float *const Result, float Input1, float Input2) {
+  *Result = __fadd_ru(Input1, Input2); // Input1 + Input2 via the CUDA intrinsic, round-up mode (toward +infinity).
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFadd_ruCases( // Runs the fadd_ru kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fadd_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fadd_ru", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fadd_rz(float *const Result, float Input1, float Input2) {
+  *Result = __fadd_rz(Input1, Input2); // Input1 + Input2 via the CUDA intrinsic, round-towards-zero mode.
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFadd_rzCases( // Runs the fadd_rz kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fadd_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fadd_rz", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fmul_rd(float *const Result, float Input1, float Input2) {
+  *Result = __fmul_rd(Input1, Input2); // Input1 * Input2 via the CUDA intrinsic, round-down mode (toward -infinity).
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFmul_rdCases( // Runs the fmul_rd kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fmul_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fmul_rd", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fmul_rn(float *const Result, float Input1, float Input2) {
+  *Result = __fmul_rn(Input1, Input2); // Input1 * Input2 via the CUDA intrinsic, round-to-nearest-even mode.
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFmul_rnCases( // Runs the fmul_rn kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fmul_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fmul_rn", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fmul_ru(float *const Result, float Input1, float Input2) {
+  *Result = __fmul_ru(Input1, Input2); // Input1 * Input2 via the CUDA intrinsic, round-up mode (toward +infinity).
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFmul_ruCases( // Runs the fmul_ru kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fmul_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fmul_ru", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fmul_rz(float *const Result, float Input1, float Input2) {
+  *Result = __fmul_rz(Input1, Input2); // Input1 * Input2 via the CUDA intrinsic, round-towards-zero mode.
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFmul_rzCases( // Runs the fmul_rz kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fmul_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fmul_rz", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fsub_rd(float *const Result, float Input1, float Input2) {
+  *Result = __fsub_rd(Input1, Input2); // Input1 - Input2 via the CUDA intrinsic, round-down mode (toward -infinity).
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFsub_rdCases( // Runs the fsub_rd kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fsub_rd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fsub_rd", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fsub_rn(float *const Result, float Input1, float Input2) {
+  *Result = __fsub_rn(Input1, Input2); // Input1 - Input2 via the CUDA intrinsic, round-to-nearest-even mode.
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFsub_rnCases( // Runs the fsub_rn kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fsub_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fsub_rn", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fsub_ru(float *const Result, float Input1, float Input2) {
+  *Result = __fsub_ru(Input1, Input2); // Input1 - Input2 via the CUDA intrinsic, round-up mode (toward +infinity).
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFsub_ruCases( // Runs the fsub_ru kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fsub_ru<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fsub_ru", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
+__global__ void fsub_rz(float *const Result, float Input1, float Input2) {
+  *Result = __fsub_rz(Input1, Input2); // Input1 - Input2 via the CUDA intrinsic, round-towards-zero mode.
+} // Writes one float through Result; no thread indexing, so every launched thread stores to the same address.
+
+void testFsub_rzCases( // Runs the fsub_rz kernel once per test case and hands each result to checkResult.
+    const vector<pair<pair<float, float>, fi_pair>> &TestCases) { // Each case: {{Input1, Input2}, fi_pair}.
+  float *Result; // Single-float output slot written by the kernel and read back on the host.
+  cudaMallocManaged(&Result, sizeof(*Result)); // NOTE(review): return code is ignored and the buffer is never cudaFree'd.
+  for (const auto &TestCase : TestCases) {
+    fsub_rz<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second); // One block, one thread.
+    cudaDeviceSynchronize(); // Wait for the kernel to finish before *Result is dereferenced below.
+    checkResult("__fsub_rz", {TestCase.first.first, TestCase.first.second}, // API name and the two inputs.
+                TestCase.second.first, *Result, TestCase.second.second); // fi_pair presumably {expected value, precision}; defined outside this chunk, confirm.
+  }
+}
+
 int main() {
   testCylBesselI0fCases({
       {0.3, {1.022626876831055, 15}},
@@ -389,6 +583,90 @@ int main() {
       {1.6, {-0.3475780, 7}},
       {5, {0.1478631347417831, 16}},
   });
+  testFadd_rdCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.699999988079071, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testFadd_rnCases({
+      {{-0.3, -0.4}, {-0.7000000476837158, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testFadd_ruCases({
+      {{-0.3, -0.4}, {-0.699999988079071, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testFadd_rzCases({
+      {{-0.3, -0.4}, {-0.699999988079071, 16}},
+      {{0.3, -0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.4}, {0.699999988079071, 16}},
+      {{0.3, 0.8}, {1.100000023841858, 15}},
+      {{3, 4}, {7, 15}},
+  });
+  testFmul_rdCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000001221895218, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testFmul_rnCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testFmul_ruCases({
+      {{-0.3, -0.4}, {0.12000001221895218, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000001221895218, 17}},
+      {{0.3, 0.8}, {0.2400000244379044, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testFmul_rzCases({
+      {{-0.3, -0.4}, {0.12000000476837158, 17}},
+      {{0.3, -0.4}, {-0.12000000476837158, 17}},
+      {{0.3, 0.4}, {0.12000000476837158, 17}},
+      {{0.3, 0.8}, {0.2400000095367432, 16}},
+      {{3, 4}, {12, 15}},
+  });
+  testFsub_rdCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.699999988079071, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testFsub_rnCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testFsub_ruCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.7000000476837158, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
+  testFsub_rzCases({
+      {{-0.3, -0.4}, {0.09999999403953552, 17}},
+      {{0.3, -0.4}, {0.699999988079071, 16}},
+      {{0.3, 0.4}, {-0.09999999403953552, 17}},
+      {{0.3, 0.8}, {-0.5, 16}},
+      {{3, 4}, {-1, 15}},
+  });
   cout << "passed " << passed << "/" << passed + failed << " cases!" << endl;
   if (failed) {
     cout << "failed!" << endl;