@@ -473,6 +473,12 @@ __ZL1a:
473
473
subroutine7 = append (subroutine7 , Subroutine {name : "SimdSse2Bgr48pToBgra32" , body : srcRetInMiddle [36 :291 ]})
474
474
475
475
testSubroutine (t , srcRetInMiddle , subroutine7 )
476
+
477
+ disabledForTesting = false
478
+ subroutine8 := []Subroutine {}
479
+ subroutine8 = append (subroutine8 , Subroutine {name : "sample_sum_sse4_2" , body : srcLabelHasSpecialComment [11 :113 ]})
480
+
481
+ testSubroutine (t , srcLabelHasSpecialComment , subroutine8 )
476
482
}
477
483
478
484
var srcClang = strings .Split (` .text
@@ -1172,3 +1178,123 @@ jmp .LBB0_36
1172
1178
1173
1179
.ident "clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"
1174
1180
.section ".note.GNU-stack","",@progbits` , "\n " )
1181
+
1182
// srcLabelHasSpecialComment is a test fixture: an x86-64 assembly listing
// (GAS directives, `.intel_syntax noprefix`, `.file "sample.c"`) held in a raw
// string literal and turned into a slice of lines with strings.Split.
//
// The listing contains one function, sample_sum_sse4_2. Its label line carries
// a trailing "# @sample_sum_sse4_2" comment and its .globl line carries
// "# -- Begin function sample_sum_sse4_2"; presumably these trailing comments
// are the "special comment" the variable name refers to.
// NOTE(review): confirm against the parser under test.
//
// After the function's "# -- End function" marker the listing ends with a
// .section/.p2align pair annotated "# -- Begin function sample_max_sse4_2";
// no body for that second function is present in the fixture.
//
// The test in this file expects the subroutine "sample_sum_sse4_2" to have the
// body srcLabelHasSpecialComment[11:113], so adding or removing lines inside
// the literal shifts those indices and must be mirrored in the test.
+ var srcLabelHasSpecialComment = strings .Split (` .text
1183
+ .intel_syntax noprefix
1184
+ .file "sample.c"
1185
+ .globl sample_sum_sse4_2 # -- Begin function sample_sum_sse4_2
1186
+ .p2align 4, 0x90
1187
+ .type sample_sum_sse4_2,@function
1188
+ sample_sum_sse4_2: # @sample_sum_sse4_2
1189
+ # %bb.0:
1190
+ push rbp
1191
+ mov rbp, rsp
1192
+ and rsp, -8
1193
+ test rsi, rsi
1194
+ jle .LBB0_1
1195
+ # %bb.2:
1196
+ lea rcx, [rdi + 8*rsi]
1197
+ lea rax, [rdi + 8]
1198
+ cmp rcx, rax
1199
+ cmova rax, rcx
1200
+ mov r9, rdi
1201
+ not r9
1202
+ add r9, rax
1203
+ shr r9, 3
1204
+ add r9, 1
1205
+ cmp r9, 4
1206
+ jae .LBB0_4
1207
+ # %bb.3:
1208
+ xor eax, eax
1209
+ jmp .LBB0_13
1210
+ .LBB0_1:
1211
+ xor eax, eax
1212
+ .LBB0_14:
1213
+ mov rsp, rbp
1214
+ pop rbp
1215
+ ret
1216
+ .LBB0_4:
1217
+ mov r8, r9
1218
+ and r8, -4
1219
+ lea rsi, [r8 - 4]
1220
+ mov rdx, rsi
1221
+ shr rdx, 2
1222
+ add rdx, 1
1223
+ mov eax, edx
1224
+ and eax, 3
1225
+ cmp rsi, 12
1226
+ jae .LBB0_6
1227
+ # %bb.5:
1228
+ pxor xmm0, xmm0
1229
+ xor esi, esi
1230
+ pxor xmm1, xmm1
1231
+ test rax, rax
1232
+ jne .LBB0_9
1233
+ jmp .LBB0_11
1234
+ .LBB0_6:
1235
+ mov esi, 1
1236
+ sub rsi, rdx
1237
+ lea rdx, [rax + rsi]
1238
+ add rdx, -1
1239
+ pxor xmm0, xmm0
1240
+ xor esi, esi
1241
+ pxor xmm1, xmm1
1242
+ .p2align 4, 0x90
1243
+ .LBB0_7: # =>This Inner Loop Header: Depth=1
1244
+ movdqu xmm2, xmmword ptr [rdi + 8*rsi]
1245
+ paddq xmm2, xmm0
1246
+ movdqu xmm0, xmmword ptr [rdi + 8*rsi + 16]
1247
+ paddq xmm0, xmm1
1248
+ movdqu xmm1, xmmword ptr [rdi + 8*rsi + 32]
1249
+ movdqu xmm3, xmmword ptr [rdi + 8*rsi + 48]
1250
+ movdqu xmm4, xmmword ptr [rdi + 8*rsi + 64]
1251
+ paddq xmm4, xmm1
1252
+ paddq xmm4, xmm2
1253
+ movdqu xmm2, xmmword ptr [rdi + 8*rsi + 80]
1254
+ paddq xmm2, xmm3
1255
+ paddq xmm2, xmm0
1256
+ movdqu xmm0, xmmword ptr [rdi + 8*rsi + 96]
1257
+ paddq xmm0, xmm4
1258
+ movdqu xmm1, xmmword ptr [rdi + 8*rsi + 112]
1259
+ paddq xmm1, xmm2
1260
+ add rsi, 16
1261
+ add rdx, 4
1262
+ jne .LBB0_7
1263
+ # %bb.8:
1264
+ test rax, rax
1265
+ je .LBB0_11
1266
+ .LBB0_9:
1267
+ lea rdx, [rdi + 8*rsi]
1268
+ add rdx, 16
1269
+ neg rax
1270
+ .p2align 4, 0x90
1271
+ .LBB0_10: # =>This Inner Loop Header: Depth=1
1272
+ movdqu xmm2, xmmword ptr [rdx - 16]
1273
+ paddq xmm0, xmm2
1274
+ movdqu xmm2, xmmword ptr [rdx]
1275
+ paddq xmm1, xmm2
1276
+ add rdx, 32
1277
+ add rax, 1
1278
+ jne .LBB0_10
1279
+ .LBB0_11:
1280
+ paddq xmm0, xmm1
1281
+ pshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1]
1282
+ paddq xmm1, xmm0
1283
+ movq rax, xmm1
1284
+ cmp r9, r8
1285
+ je .LBB0_14
1286
+ # %bb.12:
1287
+ lea rdi, [rdi + 8*r8]
1288
+ .p2align 4, 0x90
1289
+ .LBB0_13: # =>This Inner Loop Header: Depth=1
1290
+ add rax, qword ptr [rdi]
1291
+ add rdi, 8
1292
+ cmp rdi, rcx
1293
+ jb .LBB0_13
1294
+ jmp .LBB0_14
1295
+ .Lfunc_end0:
1296
+ .size sample_sum_sse4_2, .Lfunc_end0-sample_sum_sse4_2
1297
+ # -- End function
1298
+ .section .rodata.cst16,"aM",@progbits,16
1299
+ .p2align 4 # -- Begin function sample_max_sse4_2
1300
+ ` , "\n " )
0 commit comments