And suddenly: I've found the ds_gws_barrier instructions. Unfortunately I haven't found any documentation about them. If anyone knows, please tell me how they work. I'm going to check them out soon. What if they can do a global sync across ALL the waves present in the CUs? o.o That would be a jackpot.
Hi realhet,
Yes it does synchronize all waves across all CUs.
I tried many global sync schemes and finally settled on GWS (global wave sync) barriers.
Below is a simple example from my c compiler (gcnc). (opencl doesn't support global synchronization.)
The method can be seen in the C code.
The ISA instruction syntax is shown in the assembly code output just below
The example was cut from a 2D wave function where output A --> B and output B --> A.
Global sync is a must unless you re-issue the kernel each time, which greatly reduces performance.
Basically
1. Barriers must be initialized each time they are used.
2. Be careful to initialize the first barrier from the first wave to arrive; relying on global id = 0 may not work, since that wave is not guaranteed to be the first one in.
3. I always alternate between 2 or 3 barriers, initializing barrier n+1 just before hitting barrier n.
4. Barriers are initialized with the total number of waves running.
5. This is what I do, no guarantee it's the best way.
Note: the assembler accepts the gs_xxxx instruction names used below; for the usual mnemonics, just change the prefix to ds_xxxx.
#include "../gcnc.h #define BAR0 0 //define barrier IDs #define BAR1 1 kernel void lccwave( __global float *restrict wav0, __global float *restrict vel0, __global unsigned int *restrict GLB, const int ops ) { register int gx,gy,gid,i,j,ret; gx=get_global_id(0); gy=get_global_id(1); gid=256*gy+gx; ret=atomic_inc(&GLB[0],999999); //must find first wave in!! if(ret==0)gws_init(255,BAR0); //first wave initializes barrier 0 // [some code] for(j=0;j<32;j++){ //---------------main loop // [block of code] if(gid==0)gws_init(255,BAR1); //wave 0 initialzes barrier 1 gws_barrier(BAR0); // hit barrier 0 // [block of code] if(gid==0)gws_init(255,BAR0); //wave 0 initialzes barrier 0 gws_barrier(BAR1); // hit barrier 1 } //-------------end main loop gws_barrier(BAR0); //exit hit barrier 0 required }
//-----------------------------------MARKED UP ASSEMBLY OUTPUT-----------------------
// Listing restored to one instruction per line; instructions and operands are
// unchanged from the original paste.
.user elms= 3
.user_dimsi= 2
.user_sgpr = 12
.user_dims = 2
.uax.uav12
.uax.uav10
.uax.uav13
.uax.uav11
.uax.uav9
.ue(0) PTR_UAV_TABLE slot= 0 s[2:3]
.ue(1) IMM_CONST_BUFFER slot= 0 s[4:7]
.ue(2) IMM_CONST_BUFFER slot= 1 s[8:11]
.file 2,"cl\lccwave_2buf.cl"
.nvgpr: 21
.nsgpr: 32
    x_set_nvgpr 127                             //assembler directives
    x_set_nsgpr 104                             //assembler directives
    x_set_ldsmax 0x400                          //assembler directives
    s_mov_b32 m0,0xffff
    s_movk_i32 s103, 21
    s_movk_i32 s102, 32

    // v7 = s12*s0 + v0 and v6 = s13*s1 + v1; these feed gx/gy below.
    // NOTE(review): presumably group id * group size + local id -- confirm.
    s_buffer_load_dwordx2 s[0:1], s[4:7], 0x04
    s_waitcnt lgkmcnt(0)
    s_mul_i32 s0, s12, s0
    v_add_i32 v7, vcc, s0, v0
    v_mov_b32 v8, v0
    s_mul_i32 s1, s13, s1
    v_add_i32 v6, vcc, s1, v1
    v_mov_b32 v9, v1
.uavp to s[10:11]
    s_mov_b64 s[10:11], s[2:3]
    s_load_dwordx4 s[16:19], s[10:11], 0x60
    s_load_dwordx4 s[20:23], s[10:11], 0x50
    s_buffer_load_dword s24, s[8:11], 0x00
    s_buffer_load_dword s25, s[8:11], 0x04
    s_buffer_load_dword s26, s[8:11], 0x08
    s_buffer_load_dword s27, s[8:11], 0x0c
    s_buffer_load_dword s28, s[8:11], 0x10
    s_buffer_load_dword s29, s[8:11], 0x14
    s_buffer_load_dword s30, s[8:11], 0x18
    s_buffer_load_dword s31, s[8:11], 0x1c
    s_waitcnt lgkmcnt(0)

    // v14 = (v13 << 8) + v12, i.e. gid = 256*gy + gx
    v_mov_b32 v12, v7
    v_mov_b32 v13, v6
    v_lshlrev_b32 v18, 8, v13
    v_add_i32 v14, vcc, v18, v12

    // ret = atomic_inc(&GLB[0], 999999); 0xf423f == 999999
    s_load_dwordx4 s[20:23], s[10:11], 0x48
    s_waitcnt lgkmcnt(0)
    v_mov_b32 v0, 0xf423f
    v_mov_b32 v1, s28
    buffer_atomic_inc v0, v1, s[20:23], 0 offen glc   // initial sync code
    s_waitcnt vmcnt(0)                          // initial sync code
    v_mov_b32 v17, v0                           // initial sync code
    s_mov_b64 s[32:33], exec                    // initial sync code
    v_cmpx_eq_i32 vcc, 0, v17                   // initial sync code
    s_cbranch_execz label_2                     // initial sync code
    v_mov_b32 v1, 255                           // barrier code before loop
    gs_gws_init v1 offset0:0                    // barrier code
    s_waitcnt lgkmcnt(0)                        // barrier code
label_2:
    s_mov_b64 exec, s[32:33]
    s_mov_b64 s[32:33], exec
    v_mov_b32 v16, 0                            // loop counter j = 0
label_4:
    // if (gid == 0) init barrier 1, then hit barrier 0
    s_mov_b64 s[34:35], exec
    v_cmpx_eq_i32 vcc, 0, v14
    s_cbranch_execz label_10
    v_mov_b32 v1, 255                           // barrier code main loop
    gs_gws_init v1 offset0:1                    // barrier code
    s_waitcnt lgkmcnt(0)                        // barrier code
label_10:
    s_mov_b64 exec, s[34:35]
    gs_gws_barrier offset0:0
    s_waitcnt lgkmcnt(0)
    // if (gid == 0) init barrier 0, then hit barrier 1
    s_mov_b64 s[34:35], exec
    v_cmpx_eq_i32 vcc, 0, v14
    s_cbranch_execz label_12
    v_mov_b32 v1, 255                           // barrier code main loop
    gs_gws_init v1 offset0:0                    // barrier code
    s_waitcnt lgkmcnt(0)                        // barrier code
label_12:
    s_mov_b64 exec, s[34:35]
    gs_gws_barrier offset0:1
    s_waitcnt lgkmcnt(0)
label_5:
    // j++; loop while 32 > j
    v_add_i32 v16, vcc, 1, v16
    v_cmpx_gt_i32 vcc, 32, v16
    s_cbranch_execnz label_4
    s_mov_b64 exec, s[32:33]
    gs_gws_barrier offset0:0                    // barrier code end
    s_waitcnt lgkmcnt(0)                        // barrier code
label_1:
    s_endpgm
.end lccwave
end
bye..