Skip to content

Commit eb54d54

Browse files
committed
gpu_unai: asm part 3
1 parent 93b00bc commit eb54d54

File tree

4 files changed

+67
-26
lines changed

4 files changed

+67
-26
lines changed

plugins/gpu_unai/gpu_arm.S

+50-17
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "arm_features.h"
99

10+
.syntax unified
1011
.text
1112
.align 2
1213

@@ -32,13 +33,13 @@
3233
ldrh r8, [r2, r8]
3334
ldrh lr, [r2, lr]
3435
tst r6, r6
35-
strneh r6, [r0, #\obase+0]
36+
strhne r6, [r0, #\obase+0]
3637
tst r7, r7
37-
strneh r7, [r0, #\obase+2]
38+
strhne r7, [r0, #\obase+2]
3839
tst r8, r8
39-
strneh r8, [r0, #\obase+4]
40+
strhne r8, [r0, #\obase+4]
4041
tst lr, lr
41-
strneh lr, [r0, #\obase+6]
42+
strhne lr, [r0, #\obase+6]
4243
.endm
4344

4445
@ in: r0=dst, r2=pal, r12=0x1fe
@@ -53,13 +54,13 @@
5354
ldrh r8, [r2, r8]
5455
ldrh \rs,[r2, \rs]
5556
tst r6, r6
56-
strneh r6, [r0, #0]
57+
strhne r6, [r0, #0]
5758
tst r7, r7
58-
strneh r7, [r0, #2]
59+
strhne r7, [r0, #2]
5960
tst r8, r8
60-
strneh r8, [r0, #4]
61+
strhne r8, [r0, #4]
6162
tst \rs,\rs
62-
strneh \rs,[r0, #6]
63+
strhne \rs,[r0, #6]
6364
.endm
6465

6566
.global sprite_4bpp_x16_asm @ (u16 *d, void *s, u16 *pal, int lines)
@@ -175,7 +176,7 @@ sprite_driver_4bpp_asm:
175176
ldrh r7, [r2, r7]
176177
add r0, r0, #2
177178
tst r7, r7
178-
strneh r7, [r0, #-2]
179+
strhne r7, [r0, #-2]
179180
subs r8, r8, #1
180181
bgt 0b
181182
sprite_driver_part3
@@ -200,7 +201,7 @@ sprite_driver_8bpp_asm:
200201
ldrh r7, [r2, r7]
201202
add r0, r0, #2
202203
tst r7, r7
203-
strneh r7, [r0, #-2]
204+
strhne r7, [r0, #-2]
204205
subs r8, r8, #1
205206
bgt 0b
206207
sprite_driver_part3
@@ -254,7 +255,7 @@ poly_4bpp_asm:
254255
add r0, r0, #2
255256
mov r7, r4, lsr #13
256257
tst r12,r12
257-
strneh r12,[r0, #-2]
258+
strhne r12,[r0, #-2]
258259
subs r2, r2, #1
259260
bgt 0b
260261

@@ -285,7 +286,7 @@ poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
285286
and lr, r7, r9
286287
tst r12,r12
287288
add lr, r1, lr, lsl #1
288-
strneh r12,[r0, #-2]
289+
strhne r12,[r0, #-2]
289290
mov r12,r4, lsr #13
290291
subs r2, r2, #1
291292
bgt 0b
@@ -304,7 +305,7 @@ poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
304305
smulbb \t0, \t0, \mbr @ -> 0000 0000 0000 orrr rrxx xxxx xxxx xxxx
305306
smulbt \t1, \t1, \mg @ -> 0000 000o gggg gxxx xxxx xxxx xxx0 0000
306307
smulbt \t2, \t2, \mbr @ -> 00ob bbbb xxxx xxxx xxxx xx00 0000 0000
307-
and \rp, \rp, #0x8000
308+
ands \rp, \rp, #0x8000 @ retain msb + semi-transparency test
308309
usat \t0, #5, \t0, asr #14
309310
usat \t1, #5, \t1, asr #19
310311
usat \t2, #5, \t2, asr #24
@@ -313,13 +314,25 @@ poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
313314
orr \rp, \rp, \t2, lsl #10
314315
.endm
315316

316-
.global poly_4bpp_l_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
317-
poly_4bpp_l_asm:
317+
@ http://www.slack.net/~ant/info/rgb_mixing.html
318+
@ p0 = (p0 + p1) / 2; p1 |= 0x8000
319+
@ msb of input p0 is assumed to be set
320+
.macro semitrans0 p0 p1 t
321+
eor \t, \p0, \p1
322+
and \t, \t, #0x0420
323+
sub \p0, \p0, \t
324+
orr \p1, \p1, #0x8000
325+
uhadd16 \p0, \p0, \p1
326+
.endm
327+
328+
.macro poly_4bpp_asm_m name semitrans
329+
.global \name @ (void *d, const struct gpu_unai_inner_t *inn, int count)
330+
\name:
318331
.cfi_startproc
319332
stmfd sp!, {r4-r11,lr}
320333
.cfi_def_cfa_offset 4*9
321334
.cfi_rel_offset lr, 4*8
322-
poly_4bpp_init poly_4bpp_lv_asm 1
335+
poly_4bpp_init v_\name 1
323336
0:
324337
mov r12,r4, lsr #13
325338
subs r2, r2, #1
@@ -337,12 +350,20 @@ poly_4bpp_l_asm:
337350
tst r12,r12
338351
beq 0b
339352
modulate r12, r10, r11, r7, r8, lr
353+
.if \semitrans < 0
354+
@ no semi-transparency
355+
.elseif \semitrans == 0
356+
ldrhne r7, [r0, #-2]
357+
strheq r12,[r0, #-2]
358+
beq 0b
359+
semitrans0 r12, r7, lr
360+
.endif
340361
strh r12,[r0, #-2]
341362
b 0b
342363
1:
343364
ldmfd sp!, {r4-r11,pc}
344365

345-
poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
366+
v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
346367
sub sp, sp, #4*2
347368
.cfi_def_cfa_offset 4*(9+2)
348369
.cfi_rel_offset lr, 4*(8+2)
@@ -372,13 +393,25 @@ poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
372393
tst r12,r12
373394
beq 0b
374395
modulate r12, r10, r11, r5, r6, lr
396+
.if \semitrans < 0
397+
@ no semi-transparency
398+
.elseif \semitrans == 0
399+
ldrhne r7, [r0, #-2]
400+
strheq r12,[r0, #-2]
401+
beq 0b
402+
semitrans0 r12, r7, lr
403+
.endif
375404
strh r12,[r0, #-2]
376405
ldmia sp, {r5,r6}
377406
b 0b
378407
1:
379408
add sp, sp, #4*2
380409
ldmfd sp!, {r4-r11,pc}
381410
.cfi_endproc
411+
.endm
412+
413+
poly_4bpp_asm_m poly_4bpp_l_asm, -1
414+
poly_4bpp_asm_m poly_4bpp_l_st0_asm, 0
382415

383416
#endif // HAVE_ARMV6
384417

plugins/gpu_unai/gpu_arm.h

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
1616

1717
void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
1818
void poly_4bpp_l_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
19+
void poly_4bpp_l_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
1920

2021
#ifdef __cplusplus
2122
}

plugins/gpu_unai/gpu_inner.h

+11-8
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
#include "gpu_inner_light.h"
5757

5858
#include "arm_features.h"
59+
#include "compiler_features.h"
5960
#ifdef __arm__
6061
#include "gpu_inner_blend_arm.h"
6162
#include "gpu_inner_light_arm.h"
@@ -372,7 +373,7 @@ typedef void (*PS)(le16_t *pPixel, u32 count, const u8 *pTxt,
372373
const spriteDriverArg *arg);
373374

374375
template<int CF>
375-
static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
376+
static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
376377
const spriteDriverArg *arg)
377378
{
378379
// Blend func can save an operation if it knows uSrc MSB is unset.
@@ -557,7 +558,7 @@ const PS gpuSpriteDrivers[256] = {
557558
// relevant blend/light headers.
558559
// (see README_senquack.txt)
559560
template<int CF>
560-
static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
561+
static noinline void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
561562
{
562563
// Blend func can save an operation if it knows uSrc MSB is unset.
563564
// Untextured prims can always skip this (src color MSB is always 0).
@@ -754,11 +755,13 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
754755

755756
#ifdef __arm__
756757
template<int CF>
757-
static void PolySpanAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
758+
static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
758759
{
759760
switch (CF) {
760-
case 0x20: poly_4bpp_asm (pDst, &gpu_unai.inn, count); break;
761-
case 0x21: poly_4bpp_l_asm(pDst, &gpu_unai.inn, count); break;
761+
case 0x20: poly_4bpp_asm (pDst, &gpu_unai.inn, count); break;
762+
case 0x21: poly_4bpp_l_asm (pDst, &gpu_unai.inn, count); break;
763+
case 0x23: poly_4bpp_l_st0_asm(pDst, &gpu_unai.inn, count); break;
764+
default: gpuPolySpanFn<CF>(gpu_unai, pDst, count);
762765
}
763766
}
764767
#endif
@@ -778,12 +781,12 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
778781
#define TI(cf) gpuPolySpanFn<(cf)>
779782
#define TN PolyNULL
780783
#ifdef __arm__
781-
#define TA(cf) PolySpanAsm<(cf)>
784+
#define TA(cf) PolySpanMaybeAsm<(cf)>
782785
#else
783786
#define TA(cf) TI(cf)
784787
#endif
785788
#ifdef HAVE_ARMV6
786-
#define TA6(cf) PolySpanAsm<(cf)>
789+
#define TA6(cf) PolySpanMaybeAsm<(cf)>
787790
#else
788791
#define TA6(cf) TI(cf)
789792
#endif
@@ -792,7 +795,7 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
792795
TN, TN, TI((ub)|0x0a), TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \
793796
TN, TN, TI((ub)|0x12), TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \
794797
TN, TN, TI((ub)|0x1a), TI((ub)|0x1b), TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \
795-
TA((ub)|0x20), TA6((ub)|0x21),TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
798+
TA((ub)|0x20), TA6((ub)|0x21),TI((ub)|0x22), TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
796799
TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
797800
TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \
798801
TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \

plugins/gpu_unai/gpu_inner_blend_arm.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,14 @@ GPU_INLINE uint_fast16_t gpuBlendingARM(uint_fast16_t uSrc, uint_fast16_t uDst)
4141
asm ("eor %[mix], %[uSrc], %[uDst]\n\t" // uSrc ^ uDst
4242
"and %[mix], %[mix], %[mask]\n\t" // ... & 0x0421
4343
"sub %[mix], %[uDst], %[mix]\n\t" // uDst - ...
44+
#ifdef HAVE_ARMV6
45+
"uhadd16 %[mix], %[uSrc], %[mix]\n\t"
46+
#else
4447
"add %[mix], %[uSrc], %[mix]\n\t" // uSrc + ...
4548
"mov %[mix], %[mix], lsr #0x1\n\t" // ... >> 1
49+
#endif
4650
: [mix] "=&r" (mix)
47-
: [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0421));
51+
: [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0420)); // 421
4852
}
4953

5054
if (BLENDMODE == 1 || BLENDMODE == 3) {

0 commit comments

Comments
 (0)