Skip to content

Commit 7f567df

Browse files
author
Tim Foley
authored
Add support for (undocumented) HLSL 16-bit bit-cast ops (shader-slang#1528)
As of SM 6.2, the dxc compiler added support for a set of 16-bit bit-cast operations to mirror the `asuint`, `asfloat`, and `asint` operations that were provided for 32-bit scalar types. These operations are not publicly documented, so we didn't think to add them. It should be noted that there was already a similar operation in HLSL, called `f32tof16`, that took as input a `float` and then packed a half-precision version of it into the low bits of a `uint`. The problem is that using that operation for `half`->`uint16_t` conversion required a round trip through a `float`, and downstream compilers seemingly can't optimize away that conversion. This change adds the new operations along with a test that tries to make use of them to ensure the results are what is expected. There are enough cases to cover that I had to write the test in a way where each thread only writes out a subset of the required output. There are two other changes here that are not directly related to the main feature: First, it seems like the `[__forceInlineEarly]` attribute on some of these overloads interacts poorly with generics, and results in an `IRVectorType` appearing at local scope in the output code. That is semantically reasonable given our IR model, but it would ideally be something that gets eliminated as a result of deduplication of types. For now I've introduced a slight hack to make types always get inlined into their use sites during emission, which should handle the case of locally-defined types. I'm not 100% happy with that solution, but it seemed better than introducing a bunch of unrelated fixes into this PR. Second, the way that conversion operations were being declared for matrix types seems to have been incorrect: we had a single *explicit* initializer added to matrix types via an `extension` that allowed them to be initialized from other matrix types with the same size and *any* element type. 
In order to support implicit conversions of matrix types, I cribbed the code we were already using to introduce implicit conversion operations for vector types.
1 parent c2873f4 commit 7f567df

File tree

5 files changed

+219
-7
lines changed

5 files changed

+219
-7
lines changed

source/slang/core.meta.slang

+27-7
Original file line numberDiff line numberDiff line change
@@ -520,13 +520,6 @@ for( int C = 2; C <= 4; ++C )
520520
}
521521
sb << ");\n";
522522

523-
524-
// initialize from another matrix of the same size
525-
//
526-
// TODO(tfoley): See comment about how this overlaps
527-
// with implicit conversion, in the `vector` case above
528-
sb << "__generic<U> __init(matrix<U," << R << ", " << C << ">);\n";
529-
530523
// initialize from a matrix of larger size
531524
for(int rr = R; rr <= 4; ++rr)
532525
for( int cc = C; cc <= 4; ++cc )
@@ -537,6 +530,33 @@ for( int C = 2; C <= 4; ++C )
537530

538531
sb << "}\n";
539532
}
533+
534+
for (int tt = 0; tt < kBaseTypeCount; ++tt)
535+
{
536+
if(kBaseTypes[tt].tag == BaseType::Void) continue;
537+
auto toType = kBaseTypes[tt].name;
538+
}}}}
539+
__generic<let R : int, let C : int> extension matrix<$(toType),R,C>
540+
{
541+
${{{{
542+
for (int ff = 0; ff < kBaseTypeCount; ++ff)
543+
{
544+
if(kBaseTypes[ff].tag == BaseType::Void) continue;
545+
if( tt == ff ) continue;
546+
547+
auto cost = getBaseTypeConversionCost(
548+
kBaseTypes[tt],
549+
kBaseTypes[ff]);
550+
auto fromType = kBaseTypes[ff].name;
551+
}}}}
552+
__implicit_conversion($(cost))
553+
__init(matrix<$(fromType),R,C> value);
554+
${{{{
555+
}
556+
}}}}
557+
}
558+
${{{{
559+
}
540560
}}}}
541561

542562

source/slang/hlsl.meta.slang

+65
Original file line numberDiff line numberDiff line change
@@ -990,6 +990,71 @@ __generic<let N : int, let M : int>
990990
matrix<uint,N,M> asuint(matrix<uint,N,M> x)
991991
{ return x; }
992992

993+
994+
// 16-bit bitcast ops (HLSL SM 6.2)
995+
//
996+
// TODO: We need to map these to GLSL/SPIR-V
997+
// operations that don't require an intermediate
998+
// conversion to fp32.
999+
1000+
// Identity cases:
1001+
1002+
[__unsafeForceInlineEarly] float16_t asfloat16(float16_t value) { return value; }
1003+
[__unsafeForceInlineEarly] vector<float16_t,N> asfloat16<let N : int>(vector<float16_t,N> value) { return value; }
1004+
[__unsafeForceInlineEarly] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return value; }
1005+
1006+
[__unsafeForceInlineEarly] int16_t asint16(int16_t value) { return value; }
1007+
[__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<int16_t,N> value) { return value; }
1008+
[__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
1009+
1010+
[__unsafeForceInlineEarly] uint16_t asuint16(uint16_t value) { return value; }
1011+
[__unsafeForceInlineEarly] vector<uint16_t,N> asuint16<let N : int>(vector<uint16_t,N> value) { return value; }
1012+
[__unsafeForceInlineEarly] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
1013+
1014+
// Signed<->unsigned cases:
1015+
1016+
[__unsafeForceInlineEarly] int16_t asint16(uint16_t value) { return value; }
1017+
[__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<uint16_t,N> value) { return value; }
1018+
[__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
1019+
1020+
[__unsafeForceInlineEarly] uint16_t asuint16(int16_t value) { return value; }
1021+
[__unsafeForceInlineEarly] vector<uint16_t,N> asuint16<let N : int>(vector<int16_t,N> value) { return value; }
1022+
[__unsafeForceInlineEarly] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
1023+
1024+
// Float->unsigned cases:
1025+
1026+
__target_intrinsic(hlsl)
1027+
__target_intrinsic(glsl, "uint16_t(packHalf2x16(vec2($0, 0.0)))")
1028+
uint16_t asuint16(float16_t value);
1029+
1030+
vector<uint16_t,N> asuint16<let N : int>(vector<float16_t,N> value)
1031+
{ VECTOR_MAP_UNARY(uint16_t, N, asuint16, value); }
1032+
1033+
matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<float16_t,R,C> value)
1034+
{ MATRIX_MAP_UNARY(uint16_t, R, C, asuint16, value); }
1035+
1036+
// Unsigned->float cases:
1037+
1038+
__target_intrinsic(hlsl)
1039+
__target_intrinsic(glsl, "float16_t(unpackHalf2x16($0).x)")
1040+
float16_t asfloat16(uint16_t value);
1041+
1042+
vector<float16_t,N> asfloat16<let N : int>(vector<uint16_t,N> value)
1043+
{ VECTOR_MAP_UNARY(float16_t, N, asfloat16, value); }
1044+
1045+
matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<uint16_t,R,C> value)
1046+
{ MATRIX_MAP_UNARY(float16_t, R, C, asfloat16, value); }
1047+
1048+
// Float<->signed cases:
1049+
1050+
__target_intrinsic(hlsl) [__unsafeForceInlineEarly] int16_t asint16(float16_t value) { return asuint16(value); }
1051+
__target_intrinsic(hlsl) [__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<float16_t,N> value) { return asuint16(value); }
1052+
__target_intrinsic(hlsl) [__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return asuint16(value); }
1053+
1054+
__target_intrinsic(hlsl) [__unsafeForceInlineEarly] float16_t asfloat16(int16_t value) { return asfloat16(asuint16(value)); }
1055+
__target_intrinsic(hlsl) [__unsafeForceInlineEarly] vector<float16_t,N> asfloat16<let N : int>(vector<int16_t,N> value) { return asfloat16(asuint16(value)); }
1056+
__target_intrinsic(hlsl) [__unsafeForceInlineEarly] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return asfloat16(asuint16(value)); }
1057+
9931058
// Inverse tangent (HLSL SM 1.0)
9941059
__generic<T : __BuiltinFloatingPointType>
9951060
__target_intrinsic(hlsl)

source/slang/slang-emit-c-like.cpp

+12
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,18 @@ bool CLikeSourceEmitter::shouldFoldInstIntoUseSites(IRInst* inst)
974974
// for temporary variables.
975975
auto type = inst->getDataType();
976976

977+
// We treat instructions that yield a type as things we should *always* fold.
978+
//
979+
// TODO: In general, at the point where we emit code we do not expect to
980+
// find types being constructed locally (inside function bodies), but this
981+
// can end up happening because of interaction between different features.
982+
// Notably, if a generic function gets force-inlined early in codegen,
983+
// then any types it constructs will be inlined into the body of the caller
984+
// by default.
985+
//
986+
if(as<IRType>(inst) || as<IRTypeKind>(type))
987+
return true;
988+
977989
// Unwrap any layers of array-ness from the type, so that
978990
// we can look at the underlying data type, in case we
979991
// should *never* expose a value of that type
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// bit-cast-16-bit.slang
2+
3+
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile sm_6_2
4+
//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
5+
6+
//TEST_INPUT:ubuffer(data=[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], stride=4):name inputBuffer
7+
RWStructuredBuffer<int> inputBuffer;
8+
9+
int16_t readI(inout int index) { return inputBuffer[(index++) & 0xF]; }
10+
uint16_t readU(inout int index) { return inputBuffer[(index++) & 0xF]; }
11+
float16_t readF(inout int index) { return float(inputBuffer[(index++) & 0xF]); }
12+
13+
//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
14+
RWStructuredBuffer<int> outputBuffer;
15+
16+
void write(int initial, inout int index, uint value)
17+
{
18+
let tmp = index++;
19+
if((tmp & 3) == initial)
20+
{
21+
outputBuffer[tmp & 0xF] = value;
22+
}
23+
}
24+
25+
void write(int initial, inout int index, uint16_t value)
26+
{
27+
write(initial, index, uint(value));
28+
}
29+
30+
void write(int initial, inout int index, int16_t value)
31+
{
32+
write(initial, index, uint(int(value)));
33+
}
34+
35+
void write(int initial, inout int index, float16_t value)
36+
{
37+
write(initial, index, asuint(float(value)));
38+
}
39+
40+
41+
void test(int initial)
42+
{
43+
int input = initial;
44+
int output = 0;
45+
46+
// Scalar
47+
{
48+
let i = readI(input);
49+
let u = readU(input);
50+
let f = readF(input);
51+
52+
// int->float
53+
let a = asfloat16(i);
54+
write(initial, output, a);
55+
56+
// float->uint
57+
let b = asuint16(f);
58+
write(initial, output, b);
59+
60+
// uint->int
61+
let c = asint16(u);
62+
write(initial, output, c);
63+
64+
// float->float
65+
let d = asfloat16(f);
66+
write(initial, output, d);
67+
}
68+
69+
// Vector
70+
{
71+
let i = int16_t2(readI(input), readI(input));
72+
let u = uint16_t2(readU(input), readU(input));
73+
let f = float16_t2(readF(input), readF(input));
74+
75+
// uint->float
76+
let a = asfloat16(u);
77+
write(initial, output, a.x);
78+
write(initial, output, a.y);
79+
80+
// float->int
81+
let b = asint16(f);
82+
write(initial, output, b.x);
83+
write(initial, output, b.y);
84+
85+
// int->uint
86+
let c = asuint16(i);
87+
write(initial, output, c.x);
88+
write(initial, output, c.y);
89+
90+
// int->int
91+
let d = asint16(i);
92+
write(initial, output, d.x);
93+
write(initial, output, d.y);
94+
}
95+
96+
}
97+
98+
99+
[numthreads(4, 1, 1)]
100+
void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
101+
{
102+
test(dispatchThreadID.x);
103+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
0
2+
4200
3+
3
4+
40A00000
5+
34A00000
6+
34E00000
7+
4880
8+
4980
9+
3
10+
5
11+
5
12+
7

0 commit comments

Comments
 (0)