skallweitNV
diff --git a/‎docs/cpu-target.md
+17 b/‎docs/cpu-target.md
+17
diff --git a/‎source/core/slang-cpp-compiler.h
+2-1 b/‎source/core/slang-cpp-compiler.h
+2-1
diff --git a/‎source/core/slang-random-generator.cpp
+21-3 b/‎source/core/slang-random-generator.cpp
+21-3
diff --git a/‎source/core/slang-random-generator.h
+7-1 b/‎source/core/slang-random-generator.h
+7-1
diff --git a/‎source/core/slang-visual-studio-compiler-util.cpp
+9 b/‎source/core/slang-visual-studio-compiler-util.cpp
+9
diff --git a/‎source/slang/slang-compiler.cpp
+5-1 b/‎source/slang/slang-compiler.cpp
+5-1
diff --git a/‎source/slang/slang-emit-cpp.cpp
+167-54 b/‎source/slang/slang-emit-cpp.cpp
+167-54
diff --git a/‎source/slang/slang-emit-cpp.h
+4 b/‎source/slang/slang-emit-cpp.h
+4
diff --git a/‎tests/compute/array-param.slang
+1-1 b/‎tests/compute/array-param.slang
+1-1
@@ -112,6 +112,23 @@ When compiled into a shared library/dll - how is it invoked? The entry point is
 void computeMain(ComputeVaryingInput* varyingInput, UniformEntryPointParams* uniformParams, UniformState* uniformState);
 ```
 
+
+If compiled with `SLANG_HOST_CALLABLE`, the `ISlangSharedLibrary` will export a function named `computeMain` - the same name as the entry point in the original source.
+
+`ComputeVaryingInput` is defined in the prelude as:
+
+```
+struct ComputeVaryingInput
+{
+    uint3 groupID;
+    uint3 groupThreadID;
+};
+```
+
+Typically, invoking the kernel is a matter of updating the groupID/groupThreadID to specify which 'thread' of the computation to execute. For the example above we have `[numthreads(4, 1, 1)]`. This means groupThreadID.x can vary from 0-3, and .y and .z must be 0. groupID.x indicates which 'group of 4' to execute. So groupID.x = 1 with groupThreadID.x = 0, 1, 2, 3 runs the 'threads' with dispatch thread index 4, 5, 6 and 7. Being able to invoke each thread in this way is flexible, in that any specific thread can be specified and executed. It is not necessarily very efficient, because there is the call overhead and a small amount of extra work that is performed inside the kernel.
+
+For improved performance there is a mechanism to execute a whole 'thread group' in a single invocation. A function with the same signature will be exposed, with the entry point name postfixed with `_Group` - in the example above the function would be called `computeMain_Group`. When calling this function only the groupID needs to be specified; the groupThreadID is ignored. All of the threads within the group (as specified by `[numthreads]`) will be executed in a single call.
+
 The UniformState and UniformEntryPointParams struct typically vary by shader. UniformState holds 'normal' bindings, whereas UniformEntryPointParams hold the uniform entry point parameters. Where specific bindings or parameters are located can be determined by reflection. The structures for the example above would be something like the following... 
 
 ```
 
@@ -95,7 +95,8 @@ class CPPCompiler: public RefObject
             enum Enum : Flags
             {
                 EnableExceptionHandling = 0x01,
-                Verbose                 = 0x02, 
+                Verbose                 = 0x02,
+                EnableSecurityChecks    = 0x04,
             };
         };
 
 
@@ -32,15 +32,33 @@ int64_t RandomGenerator::nextInt64()
     return (int64_t(high) << 32) | low;
 }
 
-int32_t RandomGenerator::nextInt32InRange(int32_t min, int32_t max)
+uint32_t RandomGenerator::nextUInt32InRange(uint32_t min, uint32_t max)
 {
-    int32_t diff = max - min;
+    // Clamp max so it is never below min (an inverted range is treated as empty)
+    max = (max >= min) ? max : min;
+
+    // max >= min is guaranteed above, so this unsigned subtraction cannot wrap around
+    uint32_t diff = max - min;
     if (diff <= 1)
     {
         return min;
     }
+    return (nextUInt32() % diff) + min; // always in [min, max)
+}
 
-    return (nextPositiveInt32() % diff) + min;
+
+int32_t RandomGenerator::nextInt32InRange(int32_t min, int32_t max)
+{
+    // Clamp max so it is never below min (an inverted range is treated as empty)
+    max = (max >= min) ? max : min;
+
+    // Subtract in 64 bits so the difference cannot overflow int32_t; as max >= min the result fits in a uint32_t
+    uint32_t diff = uint32_t(int64_t(max) - int64_t(min));
+    if (diff <= 1) // the range holds at most one value
+    {
+        return min;
+    }
+    return int32_t(int64_t(nextUInt32() % diff) + min); // add in 64 bits, then narrow back to int32_t
 }
 
 int64_t RandomGenerator::nextInt64InRange(int64_t min, int64_t max)
 
@@ -30,6 +30,9 @@ class RandomGenerator: public RefObject
         /// Get the next bool
     virtual bool nextBool();
 
+        /// Next uint32_t
+    uint32_t nextUInt32() { return uint32_t(nextInt32()); }
+
         /// Next Int32 which can only be positive
     int32_t nextPositiveInt32() { return nextInt32() & 0x7fffffff; }
         /// Next Int64 which can only be positive
@@ -38,9 +41,12 @@ class RandomGenerator: public RefObject
         /// Returns value up to BUT NOT INCLUDING maxValue. 
     int32_t nextInt32UpTo(int32_t maxValue) { assert(maxValue > 0); return (maxValue <= 1) ? 0 : (nextPositiveInt32() % maxValue); }
 
-        /// Returns value from min up to BUT NOT INCLUDING max
+        /// Returns value from min up to BUT NOT INCLUDING max. 
     int32_t nextInt32InRange(int32_t min, int32_t max);
 
+        /// Returns value from min up to BUT NOT INCLUDING max
+    uint32_t nextUInt32InRange(uint32_t min, uint32_t max);
+
         /// Returns value up to BUT NOT INCLUDING maxValue
     int64_t nextInt64UpTo(int64_t maxValue) { assert(maxValue > 0); return (maxValue <= 1) ? 0 : (nextPositiveInt64() % maxValue); }
 
 
@@ -95,6 +95,15 @@ namespace Slang
         // Doesn't appear to be a VS equivalent
     }
 
+    if (options.flags & CompileOptions::Flag::EnableSecurityChecks)
+    {
+        cmdLine.addArg("/GS");
+    }
+    else
+    {
+        cmdLine.addArg("/GS-");
+    }
+
     switch (options.debugInfoType)
     {
         default:
 
@@ -1357,11 +1357,15 @@ SlangResult dissassembleDXILUsingDXC(
             }
         }
 
-        CPPCompiler::CompileOptions options;
+        typedef CPPCompiler::CompileOptions CompileOptions;
+        CompileOptions options;
 
         // Set the source type
         options.sourceType = (rawSourceLanguage == SourceLanguage::C) ? CPPCompiler::SourceType::C : CPPCompiler::SourceType::CPP;
 
+        // Disable exceptions and security checks
+        options.flags &= ~(CompileOptions::Flag::EnableExceptionHandling | CompileOptions::Flag::EnableSecurityChecks);
+
         // Generate a path a temporary filename for output module
         String modulePath;
         SLANG_RETURN_ON_FAIL(File::generateTemporary(UnownedStringSlice::fromLiteral("slang-generated"), modulePath));
 
@@ -2463,6 +2463,103 @@ struct GlobalParamInfo
     UInt size;
 };
 
+void CPPSourceEmitter::_emitEntryPointDefinitionStart(IRFunc* func, IRGlobalParam* entryPointGlobalParams, const String& funcName)
+{
+    auto resultType = func->getResultType();
+    
+    auto entryPointLayout = asEntryPoint(func);
+
+    // Emit the function header: the entry point attributes, then the result type together with funcName
+    emitEntryPointAttributes(func, entryPointLayout);
+    emitType(resultType, funcName);
+
+    m_writer->emit("(ComputeVaryingInput* varyingInput, UniformEntryPointParams* params, UniformState* uniformState)\n{\n");
+    emitSemantics(func);
+
+    m_writer->indent();
+    // Emit a Context initialized with `= {}` so that globals are zeroed, then set its uniformState and copy *varyingInput into it
+    m_writer->emit("Context context = {};\n");
+    m_writer->emit("context.uniformState = uniformState;\n");
+    m_writer->emit("context.varyingInput = *varyingInput;\n");
+
+    if (entryPointGlobalParams) // optional: emit `context.<name> = (<type>*)params;`
+    {
+        auto varDecl = entryPointGlobalParams;
+        auto rawType = varDecl->getDataType();
+
+        auto varType = rawType;
+
+        m_writer->emit("context.");
+        m_writer->emit(getName(varDecl));
+        m_writer->emit(" =  (");
+        emitType(varType);
+        m_writer->emit("*)params; \n");
+    }
+}
+
+void CPPSourceEmitter::_emitEntryPointDefinitionEnd(IRFunc* func)
+{
+    SLANG_UNUSED(func); // func is not needed to close the definition
+    m_writer->dedent(); // undo the indent() done by _emitEntryPointDefinitionStart
+    m_writer->emit("}\n"); // close the '{' emitted by _emitEntryPointDefinitionStart
+}
+
+// Emits one nested `for` loop per axis whose size is > 1, ordered so that the largest range is the inner loop, with a call to context._<funcName>() in the innermost body
+
+void CPPSourceEmitter::_emitEntryPointGroup(const UInt sizeAlongAxis[3], const String& funcName)
+{
+    struct AxisWithSize
+    {
+        typedef AxisWithSize ThisType;
+        bool operator<(const ThisType& rhs) const { return size < rhs.size; }
+
+        int axis;
+        UInt size;
+    };
+    List<AxisWithSize> axes;
+
+    for (int i = 0; i < 3; ++i)
+    {
+        if (sizeAlongAxis[i] > 1) // an axis of size 0 or 1 gets no loop
+        {
+            AxisWithSize axisWithSize;
+            axisWithSize.axis = i;
+            axisWithSize.size = sizeAlongAxis[i];
+            axes.add(axisWithSize);
+        }
+    }
+
+    axes.sort();
+
+    // Open all the loops, in the order produced by the sort above (AxisWithSize::operator< compares sizes)
+    StringBuilder builder;
+    for (Index i = 0; i < axes.getCount(); ++i)
+    {
+        const auto& axis = axes[i];
+        builder.Clear();
+        const char elem[2] = { s_elemNames[axis.axis], 0 }; // one-character, NUL-terminated name taken from s_elemNames
+        builder << "for (uint32_t " << elem << " = start." << elem << "; " << elem << " < start." << elem << " + " << axis.size << "; ++" << elem << ")\n{\n";
+        m_writer->emit(builder);
+        m_writer->indent();
+
+        builder.Clear();
+        builder << "context.dispatchThreadID." << elem << " = " << elem << ";\n";
+        m_writer->emit(builder);
+    }
+
+    // Emit the call `context._<funcName>();` at the innermost loop point
+    m_writer->emit("context._");
+    m_writer->emit(funcName);
+    m_writer->emit("();\n");
+
+    // Close all the loops: one dedent and closing brace per loop opened above
+    for (Index i = Index(axes.getCount() - 1); i >= 0; --i)
+    {
+        m_writer->dedent();
+        m_writer->emit("}\n");
+    }
+}
+
 void CPPSourceEmitter::emitModuleImpl(IRModule* module)
 {
     List<EmitAction> actions;
@@ -2600,77 +2697,93 @@ void CPPSourceEmitter::emitModuleImpl(IRModule* module)
             auto entryPointLayout = asEntryPoint(func);
             if (entryPointLayout)
             {
-                auto resultType = func->getResultType();
-                auto name = getFuncName(func);
+                // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sv-dispatchthreadid
+                // SV_DispatchThreadID is the sum of SV_GroupID * numthreads and GroupThreadID.
 
-                // Emit the actual function
-                emitEntryPointAttributes(func, entryPointLayout);
-                emitType(resultType, name);
+                static const UInt kAxisCount = 3;
+                UInt sizeAlongAxis[kAxisCount];
 
-                m_writer->emit("(ComputeVaryingInput* varyingInput, UniformEntryPointParams* params, UniformState* uniformState)\n{\n");
-                emitSemantics(func);
+                String funcName = getFuncName(func);
 
-                m_writer->indent();
-                // Initialize when constructing so that globals are zeroed
-                m_writer->emit("Context context = {};\n");
-                m_writer->emit("context.uniformState = uniformState;\n");
-                m_writer->emit("context.varyingInput = *varyingInput;\n");
+                {    
+                    _emitEntryPointDefinitionStart(func, entryPointGlobalParams, funcName);
 
-                if (entryPointGlobalParams)
-                {
-                    auto varDecl = entryPointGlobalParams;
-                    auto rawType = varDecl->getDataType();
+                    // Emit dispatchThreadID
+                    if (entryPointLayout->profile.GetStage() == Stage::Compute)
+                    {
+                        // TODO: this is kind of gross because we are using a public
+                        // reflection API function, rather than some kind of internal
+                        // utility it forwards to...
+                        spReflectionEntryPoint_getComputeThreadGroupSize((SlangReflectionEntryPoint*)entryPointLayout, kAxisCount, &sizeAlongAxis[0]);
 
-                    auto varType = rawType;
+                        m_writer->emit("context.dispatchThreadID = {\n");
+                        m_writer->indent();
 
-                    m_writer->emit("context.");
-                    m_writer->emit(getName(varDecl));
-                    m_writer->emit(" =  (");
-                    emitType(varType);
-                    m_writer->emit("*)params; \n");
-                }
-                
-                // Emit dispatchThreadID
-                if (entryPointLayout->profile.GetStage() == Stage::Compute)
-                {
-                    // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sv-dispatchthreadid
-                    // SV_DispatchThreadID is the sum of SV_GroupID * numthreads and GroupThreadID.
+                        StringBuilder builder;
+                        for (int i = 0; i < kAxisCount; ++i)
+                        {
+                            builder.Clear();
+                            const char elem[2] = {s_elemNames[i], 0};
+                            builder << "varyingInput->groupID." << elem << " * " << sizeAlongAxis[i] << " + varyingInput->groupThreadID." << elem;
+                            if (i < kAxisCount - 1)
+                            {
+                                builder << ",";
+                            }
+                            builder << "\n";
+                            m_writer->emit(builder);
+                        }
 
-                    static const UInt kAxisCount = 3;
-                    UInt sizeAlongAxis[kAxisCount];
+                        m_writer->dedent();
+                        m_writer->emit("};\n");
+                    }
 
-                    // TODO: this is kind of gross because we are using a public
-                    // reflection API function, rather than some kind of internal
-                    // utility it forwards to...
-                    spReflectionEntryPoint_getComputeThreadGroupSize((SlangReflectionEntryPoint*)entryPointLayout, kAxisCount, &sizeAlongAxis[0]);
+                    m_writer->emit("context._");
+                    m_writer->emit(funcName);
+                    m_writer->emit("();\n");
 
-                    m_writer->emit("context.dispatchThreadID = {\n");
-                    m_writer->indent();
+                    _emitEntryPointDefinitionEnd(func);
+                }
 
+                // Emit the group version which runs for all elements in a thread group
+                {
                     StringBuilder builder;
-                    
-                    for (int i = 0; i < kAxisCount; ++i)
+                    builder << getFuncName(func);
+                    builder << "_Group";
+
+                    String groupFuncName = builder;
+
+                    _emitEntryPointDefinitionStart(func, entryPointGlobalParams, groupFuncName);
+
+                    // Emit dispatchThreadID
+                    if (entryPointLayout->profile.GetStage() == Stage::Compute)
                     {
-                        builder.Clear();
-                        const char elem[2] = {s_elemNames[i], 0};
-                        builder << "varyingInput->groupID." << elem << " * " << sizeAlongAxis[i] << " + varyingInput->groupThreadID." << elem;
-                        if (i < kAxisCount - 1)
+                        spReflectionEntryPoint_getComputeThreadGroupSize((SlangReflectionEntryPoint*)entryPointLayout, kAxisCount, &sizeAlongAxis[0]);
+
                         {
-                            builder << ",";
+                            m_writer->emit("const uint3 start = {\n");
+                            m_writer->indent();
+                            for (int i = 0; i < kAxisCount; ++i)
+                            {
+                                builder.Clear();
+                                const char elem[2] = { s_elemNames[i], 0 };
+                                builder << "varyingInput->groupID." << elem << " * " << sizeAlongAxis[i];
+                                if (i < kAxisCount - 1)
+                                {
+                                    builder << ",";
+                                }
+                                builder << "\n";
+                                m_writer->emit(builder);
+                            }
+                            m_writer->dedent();
+                            m_writer->emit("};\n");
                         }
-                        builder << "\n";
-                        m_writer->emit(builder);
+                        m_writer->emit("context.dispatchThreadID = start;\n");
+
+                        _emitEntryPointGroup(sizeAlongAxis, funcName);
                     }
 
-                    m_writer->dedent();
-                    m_writer->emit("};\n");
+                    _emitEntryPointDefinitionEnd(func);
                 }
-
-                m_writer->emit("context._");
-                m_writer->emit(name);
-                m_writer->emit("();\n");
-                m_writer->dedent();
-                m_writer->emit("}\n");
             }
         }
     }
 
@@ -257,6 +257,10 @@ class CPPSourceEmitter: public CLikeSourceEmitter
 
     SlangResult _calcTextureTypeName(IRTextureTypeBase* texType, StringBuilder& outName);
 
+    void _emitEntryPointDefinitionStart(IRFunc* func, IRGlobalParam* entryPointGlobalParams, const String& funcName);
+    void _emitEntryPointDefinitionEnd(IRFunc* func);
+    void _emitEntryPointGroup(const UInt sizeAlongAxis[3], const String& funcName);
+
     Dictionary<SpecializedIntrinsic, StringSlicePool::Handle> m_intrinsicNameMap;
     Dictionary<IRType*, StringSlicePool::Handle> m_typeNameMap;
 
 
@@ -1,4 +1,4 @@
-//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -compile-arg -O3
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
Original file line number	Diff line number	Diff line change
`@@ -95,7 +95,8 @@ class CPPCompiler: public RefObject`
`95`	`95`	`enum Enum : Flags`
`96`	`96`	`{`
`97`	`97`	`EnableExceptionHandling = 0x01,`
`98`		`- Verbose = 0x02,`
	`98`	`+ Verbose = 0x02,`
	`99`	`+ EnableSecurityChecks = 0x04,`
`99`	`100`	`};`
`100`	`101`	`};`
`101`	`102`
Original file line number	Diff line number	Diff line change
`@@ -1357,11 +1357,15 @@ SlangResult dissassembleDXILUsingDXC(`
`1357`	`1357`	`}`
`1358`	`1358`	`}`
`1359`	`1359`
`1360`		`- CPPCompiler::CompileOptions options;`
	`1360`	`+ typedef CPPCompiler::CompileOptions CompileOptions;`
	`1361`	`+ CompileOptions options;`
`1361`	`1362`
`1362`	`1363`	`// Set the source type`
`1363`	`1364`	`options.sourceType = (rawSourceLanguage == SourceLanguage::C) ? CPPCompiler::SourceType::C : CPPCompiler::SourceType::CPP;`
`1364`	`1365`
	`1366`	`+ // Disable exceptions and security checks`
	`1367`	`+ options.flags &= ~(CompileOptions::Flag::EnableExceptionHandling \| CompileOptions::Flag::EnableSecurityChecks);`
	`1368`	`+`
`1365`	`1369`	`// Generate a path a temporary filename for output module`
`1366`	`1370`	`String modulePath;`
`1367`	`1371`	`SLANG_RETURN_ON_FAIL(File::generateTemporary(UnownedStringSlice::fromLiteral("slang-generated"), modulePath));`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute`
	`1`	`+//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -compile-arg -O3`
`2`	`2`	`//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute`
`3`	`3`	`//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12`
`4`	`4`	`//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute`