/*
 * Copyright (C) 2021 Intel Corporation
 *
 * This software and the related documents are Intel copyrighted materials, and your use of them
 * is governed by the express license under which they were provided to you ("License"). Unless
 * the License provides otherwise, you may not use, modify, copy, publish, distribute, disclose
 * or transmit this software or the related documents without Intel's prior written permission.
 *
 * This software and the related documents are provided as is, with no express or implied
 * warranties, other than those that are expressly stated in the License.
*/

/* Required sub-group size of the kernel; also the float4 stride between
 * consecutive block reads. */
#define SUBGROUP_SIZE           8
/* Number of outer passes the kernel makes over its slot. */
#define KERNEL_LOOP_ITERATIONS  500
/* Number of block reads issued per outer pass (also the unroll hint). */
#define NUM_SENDS               16

/* Bytes attributed to one block read; with the values in this file this
 * equals SUBGROUP_SIZE * SEND_SIZE. */
#define BYTES_PER_SEND          128
/* Bytes spanned by the NUM_SENDS reads of one pass; multiplied by the slot
 * index to form the slot's base byte offset. */
#define THREAD_TILE_SIZE        (NUM_SENDS * BYTES_PER_SEND)

/* Floats per vector element. */
#define VECTOR_SIZE             4
/* Size in bytes of one vector element; divides a byte offset to turn it into
 * a float4 element index. */
#define SEND_SIZE               (VECTOR_SIZE * sizeof(float))

// slot - memory that is associated with one EU

/*
 * GlobalAdd5 - repeatedly block-reads float4 data from pa and accumulates it.
 *
 * A slot index is derived per sub-group from the work-group id and the
 * sub-group id, reduced modulo slotMask. On each of KERNEL_LOOP_ITERATIONS
 * passes the kernel issues NUM_SENDS sub-group block reads, starting at
 * float4 element
 *     (THREAD_TILE_SIZE * slot + sliceSize * (pass & sliceMask)) / SEND_SIZE
 * and stepping SUBGROUP_SIZE float4 elements between reads.
 *
 * pa        - buffer the block reads are issued against.
 * pb        - written at the work-item's global id, and only when the x
 *             component of the accumulated sum is negative.
 * slotMask  - modulus applied to the flat sub-group index.
 *             NOTE(review): despite the name this is used with '%', not '&';
 *             a value of 0 would be a modulo by zero - confirm the host
 *             never passes 0.
 * sliceSize - scaled by (pass & sliceMask) and added to the slot's base
 *             offset before the division by SEND_SIZE.
 * sliceMask - ANDed with the pass counter to select the slice multiplier.
 */
__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE)))
__kernel void GlobalAdd5(__global float4 *const pa,
    __global float4* pb, unsigned int slotMask, unsigned int sliceSize, unsigned int sliceMask)
{
    const uint gid = get_global_id(0);

    // Flat sub-group index across work-groups, reduced modulo slotMask.
    const uint sid = (get_group_id(0) * get_num_sub_groups() + get_sub_group_id()) % slotMask;

    // Base offset of this slot; does not change between passes.
    const uint tileBase = THREAD_TILE_SIZE * sid;

    float4 sum = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    for (uint pass = 0; pass < KERNEL_LOOP_ITERATIONS; ++pass)
    {
        const uint sliceOffset = sliceSize * (pass & sliceMask);

        // Convert the combined offset into an index of float4 elements.
        uint elem = (tileBase + sliceOffset) / SEND_SIZE;

        __attribute__((opencl_unroll_hint(NUM_SENDS)))
        for (uint send = 0; send < NUM_SENDS; ++send)
        {
            const __global uint *src = (const __global uint *)(pa + elem);

            sum += as_float4(intel_sub_group_block_read4(src));
            elem += SUBGROUP_SIZE;
        }
    }

    // The conditional store makes the accumulated value observable, which
    // keeps the compiler from discarding the reads above.
    if (sum.x < 0.0f)
    {
        pb[gid] = sum;
    }
}
