skallweitNV
diff --git a/‎docs/target-compatibility.md
+18 b/‎docs/target-compatibility.md
+18
diff --git a/‎prelude/slang-cuda-prelude.h
+7 b/‎prelude/slang-cuda-prelude.h
+7
diff --git a/‎source/core/slang-test-tool-util.cpp
+35-10 b/‎source/core/slang-test-tool-util.cpp
+35-10
diff --git a/‎source/core/slang-test-tool-util.h
+8 b/‎source/core/slang-test-tool-util.h
+8
diff --git a/‎source/slang/hlsl.meta.slang
+62 b/‎source/slang/hlsl.meta.slang
+62
diff --git a/‎source/slang/slang-check-decl.cpp
+79-12 b/‎source/slang/slang-check-decl.cpp
+79-12
@@ -40,6 +40,7 @@ Items with ^ means there is some discussion about support later in the document
 | Atomics                     |     Yes      |   Yes        |   Yes      |     Yes       |    No + 
 | Atomics on RWBuffer         |     Yes      |   Yes        |   Yes      |     No        |    No + 
 | Sampler Feedback            |     No       |   Yes        |   No +     |     No        |    Yes ^
+| RWByteAddressBuffer Atomic  |     No       |   Yes ^      |   Yes ^    |     Yes       |    No +
 
 ## Half Type
 
@@ -179,3 +180,20 @@ There doesn't not appear to be a similar feature available in Vulkan yet, but wh
 
 For CPU targets there is the IFeedbackTexture interface that requires an implemention for use. Slang does not currently include CPU implementations for texture types.  
 
+## RWByteAddressBuffer Atomic
+
+Currently this feature allows atomic float additions on RWByteAddressBuffer. A future update will broaden the types supported. The following methods are available on RWByteAddressBuffer:
+
+```
+void RWByteAddressBuffer::InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue);
+void RWByteAddressBuffer::InterlockedAddFp32(uint byteAddress, float valueToAdd);
+```
+
+On HLSL based targets this functionality is achieved using [nvAPI](https://developer.nvidia.com/nvapi). Therefore, for the feature to work you must have nvAPI installed on your system. The 'prelude' functionality of the API can then be used to add an include (or the text) of the relevant files. To see how to do this in practice look at the function `setSessionDefaultPrelude`. It makes the prelude for HLSL hold an include of the *absolute* path to the required include file `nvHLSLExtns.h`. As an absolute path is used, any further files that this header itself includes are looked up in the correct place without having to set up special include paths.
+
+To use nvAPI it is necessary to specify an unordered access view (UAV) based 'u' register that will be used to communicate with nvAPI. Note! Slang does not do any special handling around this, so it is necessary for application code to ensure that the UAV is either guaranteed not to collide with what Slang assigns, or that it is specified (but not used) in the Slang source. The u register number also has to be specified to the nvAPI runtime library.
+
+On Vulkan, the [`GL_EXT_shader_atomic_float`](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VK_EXT_shader_atomic_float.html) extension is required.
+
+
+
@@ -472,6 +472,13 @@ struct RWByteAddressBuffer
         *(T*)((char*)data + offset) = value;
     }
 
+        /// Returns a pointer to the `uint` at byte `offset` (asserted in-bounds and aligned); can be used in stdlib to gain access
+    SLANG_CUDA_CALL uint* _getPtrAt(size_t offset)
+    {
+        SLANG_PRELUDE_ASSERT(offset + sizeof(uint) <= sizeInBytes && (offset & (alignof(uint)-1)) == 0); 
+        return (uint*)(((char*)data) + offset);
+    }
+    
     uint32_t* data;
     size_t sizeInBytes; //< Must be multiple of 4 
 };
 
@@ -75,29 +75,54 @@ static SlangResult _addCUDAPrelude(const String& parentPath, slang::IGlobalSessi
     return SLANG_OK;
 }
 
-/* static */SlangResult TestToolUtil::setSessionDefaultPrelude(const char* exePath, slang::IGlobalSession* session)
+/* static */SlangResult TestToolUtil::setSessionDefaultPrelude(const PreludeInfo& info, slang::IGlobalSession* session)
 {
     // Set the prelude to a path
-    String canonicalPath;
-    if (SLANG_SUCCEEDED(Path::getCanonical(exePath, canonicalPath)))
+    if (info.exePath) // exePath is optional; when it is null the C++/CUDA preludes are not touched
     {
-        // Get the directory
-        String parentPath = Path::getParentDirectory(canonicalPath);
+        String exePath(info.exePath);
 
-        if (SLANG_FAILED(_addCPPPrelude(parentPath, session)))
+        String canonicalPath;
+        if (SLANG_SUCCEEDED(Path::getCanonical(exePath, canonicalPath))) // if canonicalization fails, prelude setup is skipped without reporting an error
         {
-            SLANG_ASSERT(!"Couldn't find the C++ prelude relative to the executable");
+            // The preludes are looked up relative to the directory that holds the executable
+            String parentPath = Path::getParentDirectory(canonicalPath);
+
+            if (SLANG_FAILED(_addCPPPrelude(parentPath, session)))
+            {
+                SLANG_ASSERT(!"Couldn't find the C++ prelude relative to the executable"); // failure is only asserted; it is not reflected in the return value
+            }
+
+            if (SLANG_FAILED(_addCUDAPrelude(parentPath, session)))
+            {
+                SLANG_ASSERT(!"Couldn't find the CUDA prelude relative to the executable"); // failure is only asserted; it is not reflected in the return value
+            }
         }
-
-        if (SLANG_FAILED(_addCUDAPrelude(parentPath, session)))
+    }
+    // If the nvAPI path is set, and we find nvHLSLExtns.h, put that in the HLSL prelude (nothing happens if the header is not found)
+    if (info.nvAPIPath)
+    {
+        String includePath;
+        if (SLANG_SUCCEEDED(_calcIncludePath(info.nvAPIPath, "nvHLSLExtns.h", includePath)))
         {
-            SLANG_ASSERT(!"Couldn't find the CUDA prelude relative to the executable");
+            StringBuilder buf;
+            // The HLSL prelude text is a single #include directive naming the path that was found
+            buf << "#include \"" << includePath << "\"\n";
+
+            session->setLanguagePrelude(SLANG_SOURCE_LANGUAGE_HLSL, buf.getBuffer());
+            return SLANG_OK; // same value as the fall-through return below
         }
     }
 
     return SLANG_OK;
 }
 
+/* static */SlangResult TestToolUtil::setSessionDefaultPrelude(const char* exePath, slang::IGlobalSession* session)
+{
+    PreludeInfo info; // only exePath is filled in; every other field keeps its default
+    info.exePath = exePath;
+    return setSessionDefaultPrelude(info, session); // forwards to the PreludeInfo overload
+}
 
 }
 
@@ -36,6 +36,12 @@ enum class ToolReturnCodeSpan
 /* Utility functions for 'test tools' */
 struct TestToolUtil
 {
+    struct PreludeInfo // Inputs for setSessionDefaultPrelude; both paths are optional
+    {
+        const char* exePath = nullptr;      ///< Path of the executable; the C++/CUDA preludes are located relative to it. May be null.
+        const char* nvAPIPath = nullptr;    ///< Path searched for nvHLSLExtns.h, used for the HLSL prelude. May be null.
+    };
+
     typedef SlangResult(*InnerMainFunc)(Slang::StdWriters* stdWriters, SlangSession* session, int argc, const char*const* argv);
 
         /// If the test failed to run or was ignored then we are done
@@ -48,6 +54,8 @@ struct TestToolUtil
     static ToolReturnCode getReturnCode(SlangResult res);
 
         /// Sets the default preludes on the session based on the executable path
+    static SlangResult setSessionDefaultPrelude(const PreludeInfo& preludeInfo, slang::IGlobalSession* session);
+
     static SlangResult setSessionDefaultPrelude(const char* exePath, slang::IGlobalSession* session);
 };
 
 
@@ -48,6 +48,29 @@ struct ByteAddressBuffer
     }
 };
 
+
+// Make the GLSL atomicAdd available.
+// We have separate int/float implementations, as the float version requires some specific extensions
+// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_atomic_float.txt
+
+__target_intrinsic(glsl, "atomicAdd($0, $1)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_float)
+//__glsl_extension(GL_EXT_gpu_shader5)
+float __atomicAdd(__ref float value, float amount);
+
+// Int versions require glsl 4.30
+// https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml
+
+__target_intrinsic(glsl, "atomicAdd($0, $1)")
+__glsl_version(430)
+int __atomicAdd(__ref int value, int amount);
+
+__target_intrinsic(glsl, "atomicAdd($0, $1)")
+__glsl_version(430)
+uint __atomicAdd(__ref uint value, uint amount);
+
+
 __intrinsic_op($(kIROp_ByteAddressBufferLoad))
 T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset);
 
@@ -159,6 +182,41 @@ struct $(item.name)
     {
         return __byteAddressBufferLoad<T>(this, location);
     }
+${{{{
+    if (item.op == kIROp_HLSLRWByteAddressBufferType)
+    {
+}}}}
+
+    // float32 atomic add support (no int64 variant is defined by these methods). This is a Slang specific extension, it uses
+    // GL_EXT_shader_atomic_float on vk
+    // NvAPI support on DX
+    // NOTE! To use this feature on HLSL, the shader needs to include 'nvHLSLExtns.h' from the NvAPI SDK
+    // `originalValue` receives whatever the target's atomic add returns.
+    __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))")
+    __target_intrinsic(cuda, "(*$3 = atomicAdd((float*)$0._getPtrAt($1), $2))")
+    void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue);
+    // GLSL: view the buffer as RWStructuredBuffer<float> and atomicAdd element byteAddress / 4 (assumes byteAddress is a multiple of 4 - TODO confirm)
+    __specialized_for_target(glsl)
+    void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue)
+    {
+        RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this);
+        originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd);
+    }
+    // Overload without `originalValue`: the value returned by the atomic add is discarded
+    __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))")
+    __target_intrinsic(cuda, "atomicAdd((float*)$0._getPtrAt($1), $2)")
+    void InterlockedAddFp32(uint byteAddress, float valueToAdd);
+
+    __specialized_for_target(glsl)
+    void InterlockedAddFp32(uint byteAddress, float valueToAdd)
+    {
+        RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this);
+        __atomicAdd(buf[byteAddress / 4], valueToAdd);
+    }
+
+${{{{
+    }
+}}}}
 
     // Added operations:
 
@@ -1091,6 +1149,10 @@ T dot(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinFloatingPointType> vector<T,4> dst(vector<T,4> x, vector<T,4> y);
 
+// Given a RWByteAddressBuffer allow it to be interpreted as a RWStructuredBuffer
+__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer))
+RWStructuredBuffer<T> __getEquivalentStructuredBuffer<T>(RWByteAddressBuffer b);
+
 // Error message
 
 // void errorf( string format, ... );
 
@@ -3520,6 +3520,7 @@ namespace Slang
         return subst;
     }
 
+#if 0
     // For simplicity we will make having a definition of a function include having a body or a target intrinsics defined.
     // It may be useful to add other modifiers to mark as having body - for example perhaps
     // any target intrinsic modifier (like SPIR-V version) should be included.
@@ -3536,6 +3537,40 @@ namespace Slang
     {
         return decl->body || decl->hasModifier<TargetIntrinsicModifier>();
     }
+#endif
+
+    typedef Dictionary<Name*, CallableDecl*> TargetDeclDictionary; // target name -> decl defining it; the nullptr key holds a body that is not specialized for a target
+        /// Adds to ioDict one entry per target that decl defines: its specialized-for-target name, else each target-intrinsic name plus nullptr if it has a body
+    static void _addTargetModifiers(CallableDecl* decl, TargetDeclDictionary& ioDict)
+    {
+        if (auto specializedModifier = decl->findModifier<SpecializedForTargetModifier>())
+        {
+            // If it's specialized for target it should have a body (only asserted, and only for function decls)
+            if (auto funcDecl = as<FunctionDeclBase>(decl))
+            {
+                SLANG_ASSERT(funcDecl->body);
+            }
+            Name* targetName = specializedModifier->targetToken.getName();
+
+            ioDict.AddIfNotExists(targetName, decl);
+        }
+        else
+        {
+            for (auto modifier : decl->getModifiersOfType<TargetIntrinsicModifier>()) // a decl may carry several target intrinsics; each target is recorded
+            {
+                Name* targetName = modifier->targetToken.getName();
+                ioDict.AddIfNotExists(targetName, decl);
+            }
+
+            auto funcDecl = as<FunctionDeclBase>(decl);
+            if (funcDecl && funcDecl->body)
+            {
+                // Should only be one body if it isn't specialized for target.
+                // It is recorded under the nullptr key, which stands for "no specific target"
+                ioDict.AddIfNotExists(nullptr, decl);
+            }
+        }  
+    }
 
     Result SemanticsVisitor::checkFuncRedeclaration(
         FuncDecl* newDecl,
@@ -3701,23 +3736,55 @@ namespace Slang
         // with the case where the two function declarations
         // might represent different target-specific versions
         // of a function.
-        //
-        // TODO: if the two declarations are specialized for
-        // different targets, then skip the body checks below.
-        //
-        // ???: Why isn't this problem showing up in practice?
-
+       
         // If both of the declarations have a body, then there
         // is trouble, because we wouldn't know which one to
         // use during code generation.
-        if (_isDefinition(newDecl) && _isDefinition(oldDecl))
+
+        // Here to cover the 'bodies'/target_intrinsics, we find all the targets that
+        // that are previously defined, and make sure the new definition
+        // doesn't try and define what is already defined.
         {
-            // Redefinition
-            getSink()->diagnose(newDecl, Diagnostics::functionRedefinition, newDecl->getName());
-            getSink()->diagnose(oldDecl, Diagnostics::seePreviousDefinitionOf, newDecl->getName());
+            TargetDeclDictionary currentTargets;
+            {
+                CallableDecl* curDecl = newDecl->primaryDecl;
+                while (curDecl)
+                {
+                    if (curDecl != newDecl)
+                    {
+                        _addTargetModifiers(curDecl, currentTargets);
+                    }
+                    curDecl = curDecl->nextDecl;
+                }
+            }
 
-            // Don't bother emitting other errors
-            return SLANG_FAIL;
+            // Add the targets for this new decl
+            TargetDeclDictionary newTargets;
+            _addTargetModifiers(newDecl, newTargets);
+
+            bool hasConflict = false;
+            for (auto& pair : newTargets)
+            {
+                Name* target = pair.Key;
+                auto found = currentTargets.TryGetValue(target);
+                if (found)
+                {
+                    // Redefinition
+                    if (!hasConflict)
+                    {
+                        getSink()->diagnose(newDecl, Diagnostics::functionRedefinition, newDecl->getName());
+                        hasConflict = true;
+                    }
+
+                    auto prevDecl = *found;
+                    getSink()->diagnose(prevDecl, Diagnostics::seePreviousDefinitionOf, prevDecl->getName());
+                }
+            }
+
+            if (hasConflict)
+            {
+                return SLANG_FAIL;
+            }
         }
 
         // At this point we've processed the redeclaration and