5. Tensor Operation Backend

After experimenting with different Neon implementations and developing kernels for our GEMM and BRGEMM, and most recently for the unary primitives, it is now time to combine all of these kernels in a common backend.

5.1 User Interface

The first thing that we need for our backend is a common entry point: our setup function. Within the setup function we parse a number of configuration parameters, from which the corresponding kernels and primitives are constructed at runtime.

In our setup function, we check several things:

error handling for dimensions, execution types and data type
/////////////////////////////////////////////////////////////////////
// Check the number of dimensions
/////////////////////////////////////////////////////////////////////
if (dim_types.size() != dim_sizes.size() ||
    dim_types.size() != strides_in0.size() ||
    dim_types.size() != strides_in1.size() ||
    dim_types.size() != strides_out.size())
{
    return error_t::wrong_dimension;
}

/////////////////////////////////////////////////////////////////////
// Check the number of prim exec types
/////////////////////////////////////////////////////////////////////
int prim_count = std::count(exec_types.begin(), exec_types.end(), exec_t::prim);
if (prim_main == ptype_t::brgemm && prim_count != 4)
{
    return error_t::wrong_exec_type;
}
else if (prim_main == ptype_t::gemm && prim_count != 3)
{
    return error_t::wrong_exec_type;
}
else if (prim_main == ptype_t::identity && prim_count != 2)
{
    return error_t::wrong_exec_type;
}
else if (prim_main == ptype_t::add || prim_main == ptype_t::sub ||
         prim_main == ptype_t::mul || prim_main == ptype_t::div ||
         prim_main == ptype_t::min || prim_main == ptype_t::max)
{
    if (prim_count != 2)
    {
        return error_t::wrong_exec_type;
    }
}

/////////////////////////////////////////////////////////////////////
// Check allowed data type
/////////////////////////////////////////////////////////////////////
if (dtype != dtype_t::fp32)
{
    return error_t::wrong_dtype;
}

/////////////////////////////////////////////////////////////////////
// Check allowed primitive types
/////////////////////////////////////////////////////////////////////
std::vector<ptype_t> allowed_first_touch_types = {
    ptype_t::none,
    ptype_t::zero,
    ptype_t::relu,
    ptype_t::reciprocal,
    ptype_t::increment,
    ptype_t::decrement
};
std::vector<ptype_t> allowed_main_types = {
    ptype_t::none,
    ptype_t::identity,
    ptype_t::brgemm,
    ptype_t::gemm,
    ptype_t::add,
    ptype_t::sub,
    ptype_t::mul,
    ptype_t::div,
    ptype_t::min,
    ptype_t::max
};
std::vector<ptype_t> allowed_last_touch_types = {
    ptype_t::none,
    ptype_t::relu,
    ptype_t::square,
    ptype_t::reciprocal,
    ptype_t::decrement,
    ptype_t::fast_sigmoid,
    ptype_t::sigmoid_interp,
    ptype_t::sigmoid_taylor,
};

if (std::find(allowed_first_touch_types.begin(), allowed_first_touch_types.end(), prim_first_touch) == allowed_first_touch_types.end())
{
    return error_t::wrong_ptype;
}
if (std::find(allowed_main_types.begin(), allowed_main_types.end(), prim_main) == allowed_main_types.end())
{
    return error_t::wrong_ptype;
}
if (std::find(allowed_last_touch_types.begin(), allowed_last_touch_types.end(), prim_last_touch) == allowed_last_touch_types.end())
{
    return error_t::wrong_ptype;
}

/////////////////////////////////////////////////////////////////////
// Assign member variables
/////////////////////////////////////////////////////////////////////

We need to know the position of the first prim dimension to determine when to call the main kernel instead of recursing deeper into the loop structure. In other words, we traverse the sequential loops first, and as soon as we reach the first primitive dimension, we start calling the main kernel.
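To illustrate this idea, here is a small, self-contained sketch with made-up sizes (it is not our actual code; fragments of the real execute_iter appear in the following sections):

#include <cstdint>
#include <iostream>
#include <vector>

// two outer sequential loops; everything from dimension 2 onwards is assumed
// to be handled inside the generated main kernel
static const std::vector<int64_t> dim_sizes = {2, 3};
static const int64_t id_first_primitive_loop = 2;

void execute_iter_sketch(int64_t id_loop, std::vector<int64_t> &indices)
{
    if (id_loop >= id_first_primitive_loop)
    {
        // base case: the remaining dimensions belong to the primitive
        std::cout << "main kernel call for iteration ("
                  << indices[0] << ", " << indices[1] << ")\n";
        return;
    }
    for (int64_t i = 0; i < dim_sizes[id_loop]; ++i)
    {
        indices[id_loop] = i;
        execute_iter_sketch(id_loop + 1, indices);
    }
}

int main()
{
    std::vector<int64_t> indices(2, 0);
    execute_iter_sketch(0, indices);
    return 0;
}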

assign the size of the prim dimensions according to the order in dim_types
m_dim_id_seq_N = -1;
m_dim_id_seq_K = -1;
m_dim_id_sha_M = -1;
m_dim_id_sha_N = -1;
m_num_parallel_loops = 0;

/////////////////////////////////////////////////////////////////////
// Find first PRIM and SEQ dimensions in exec types
/////////////////////////////////////////////////////////////////////
auto it = std::find(exec_types.begin(), exec_types.end(), exec_t::prim);
if (it != exec_types.end())
{
    m_id_first_primitive_loop = std::distance(exec_types.begin(), it);
}
else
{
    m_id_first_primitive_loop = 0;
}

it = std::find(exec_types.begin(), exec_types.end(), exec_t::seq);
if (it != exec_types.end())
{
    m_id_first_seq_loop = std::distance(exec_types.begin(), it);
}
else
{
    m_id_first_seq_loop = -1;
}
assign the size of the seq and shared dimensions according to the order in dim_types
/////////////////////////////////////////////////////////////////////
// Find SHARED dimensions in exec types
/////////////////////////////////////////////////////////////////////
m_shared_loop_ids.clear();
m_shared_loop_sizes.clear();
for (size_t i = 0; i < m_exec_types.size(); ++i)
{
    if (m_exec_types[i] == exec_t::shared)
    {
        m_shared_loop_ids.push_back(i);
        m_shared_loop_sizes.push_back(m_dim_sizes[i]);
    }
}

/////////////////////////////////////////////////////////////////////
// Read PRIM dimensions using dim types (No Copy)
/////////////////////////////////////////////////////////////////////
// convert to int so negative values are allowed
int l_dim_types_size = static_cast<int>(m_dim_types.size());
for (int i = l_dim_types_size - 1; i >= 0; i--)
{
    if (m_exec_types[i] == exec_t::prim)
    {
        if (m_dim_id_prim_M == -1 && m_dim_types[i] == dim_t::m)
        {
            m_dim_id_prim_M = i;
        }
        else if (m_dim_id_prim_N == -1 && m_dim_types[i] == dim_t::n)
        {
            m_dim_id_prim_N = i;
        }
        else if (m_dim_id_prim_K == -1 && m_dim_types[i] == dim_t::k)
        {

After checking all these things, we were then able to create our kernels accordingly.

construct kernels based on assigned member variables
// Check for Transposition
/////////////////////////////////////////////////////////////////////
if (m_dim_id_prim_M != -1)
{
    int64_t l_stride_in0 = m_strides_in0[m_dim_id_prim_M];
    int64_t l_stride_out = m_strides_out[m_dim_id_prim_M];
    // set transpose flag to true if the strides are different
    m_transpose_output = l_stride_in0 != l_stride_out;
}
else
{
    // idk if we can check for transposition without M
    m_transpose_output = false;
}

/////////////////////////////////////////////////////////////////////
// Adjust strides based on primitive type and transposition
/////////////////////////////////////////////////////////////////////
if (prim_main == ptype_t::identity)
{
    if (!m_transpose_output)
    {
        m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_N];
        m_adjusted_stride_in1 = 0;
        m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
    }
    else
    {
        m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_N];
        m_adjusted_stride_in1 = 0;
        m_adjusted_stride_out = m_strides_out[m_dim_id_prim_M];
    }
}
else if(prim_main == ptype_t::add || prim_main == ptype_t::sub ||
        prim_main == ptype_t::mul || prim_main == ptype_t::div ||
        prim_main == ptype_t::min || prim_main == ptype_t::max)
{
        m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_N];
        m_adjusted_stride_in1 = m_strides_in1[m_dim_id_prim_N];
        m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
}
else
{
    // GEMM & BRGEMM
    m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_K];
    m_adjusted_stride_in1 = m_strides_in1[m_dim_id_prim_N];
    m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
}
m_adjusted_br_size_A = m_dim_id_prim_BR != -1 ? m_strides_in0[m_dim_id_prim_BR] : 1;
m_adjusted_br_size_B = m_dim_id_prim_BR != -1 ? m_strides_in1[m_dim_id_prim_BR] : 1;

/////////////////////////////////////////////////////////////////////
// Generate kernels
/////////////////////////////////////////////////////////////////////
if (prim_first_touch != ptype_t::none)
{
    // no transposition
    m_unary_first_touch.generate(m_dim_sizes[m_dim_id_prim_M],
                                 m_dim_sizes[m_dim_id_prim_N],
                                 0,
                                 dtype,
                                 prim_first_touch);

5.2 Recursive Loops over Primitives

After constructing our kernels, we still needed to build an execution function that combines our main primitive with our first and last touches.

Our starting point is an execute function that takes the pointers to our matrices and passes them to our execute_iter function.

starting point: execute function
                           1,
                           0,
                           0,
                           0,
                           dtype);
    m_kernel_gemm_main = m_brgemm_main.get_kernel();
}
else if (prim_main == ptype_t::brgemm)
{
    // no transposition
    m_brgemm_main.generate(m_dim_sizes[m_dim_id_prim_M],
                           m_dim_sizes[m_dim_id_prim_N],
                           m_dim_sizes[m_dim_id_prim_K],
                           m_dim_sizes[m_dim_id_prim_BR],
                           0,

The ‘real’ execution happens in the execute_iter function. We first check if the current iteration is the first or last access to a block in our output matrix. Next, we update the pointers to the matrices accordingly.
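The gist of this first/last-access logic can be sketched with a small, hypothetical helper (not our actual member function): only loops over a K dimension revisit the same output block, so only they restrict the flags.

#include <cstdint>
#include <utility>

enum class dim_kind { m, n, k };

// returns {is_first, is_last} for one iteration of a sequential loop
std::pair<bool, bool> access_flags(dim_kind type, int64_t iter, int64_t size,
                                   bool first_access, bool last_access)
{
    if (type == dim_kind::k)
    {
        // the output block is touched for the first time in iteration 0
        // and for the last time in the final iteration of the K loop
        return {first_access && iter == 0, last_access && iter == size - 1};
    }
    // M and N loops work on a different output block in every iteration
    return {first_access, last_access};
}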

calculate if it is the first or last access in our output matrix and update pointers
{
    // no main kernel
    m_kernel_gemm_main = nullptr;
    m_kernel_binary_main = nullptr;
    m_kernel_unary_main = nullptr;
}

if (prim_last_touch != ptype_t::none)
{
    // no transposition
    m_unary_last_touch.generate(m_dim_sizes[m_dim_id_prim_M],
                                m_dim_sizes[m_dim_id_prim_N],

In the next step, execute_iter recursively calls itself, descending one loop level for each seq dimension in our exec_types.

recursive call to execute_iter
                                prim_last_touch);
    m_kernel_last_touch = m_unary_last_touch.get_kernel();
}

m_kernel_first_touch_type = prim_first_touch;
m_kernel_main_type = prim_main;
m_kernel_last_touch_type = prim_last_touch;

m_has_been_setup = true;

If we have no further recursive call, we can execute the kernels.

execute the kernels
}

void mini_jit::TensorOperation::execute(void const *tensor_in0,
                                        void const *tensor_in1,
                                        void *tensor_out)
{
    if (!m_has_been_setup)
    {
        std::cerr << "TensorOperation has not been setup. Call setup() before execute()." << std::endl;
        return;
    }

    auto ptr_in0 = static_cast<char const *>(tensor_in0);
    auto ptr_in1 = static_cast<char const *>(tensor_in1);
    auto ptr_out = static_cast<char *>(tensor_out);

    if (m_num_parallel_loops == 0)
    {
        // No shared loops, execute sequentially
        execute_iter(0,
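At the innermost recursion level, the three kernels are applied to the current output block in a fixed order. The following standalone sketch shows that dispatch order (hypothetical and heavily simplified; the real kernels of course operate on the adjusted pointers and strides):

#include <functional>
#include <iostream>

// apply first touch, main kernel and last touch to one output block,
// guarded by the access flags
void run_block(bool is_first, bool is_last,
               const std::function<void()> &first_touch,
               const std::function<void()> &main_kernel,
               const std::function<void()> &last_touch)
{
    if (is_first && first_touch) first_touch(); // e.g. zero the block
    if (main_kernel) main_kernel();             // e.g. (BR)GEMM accumulation
    if (is_last && last_touch) last_touch();    // e.g. ReLU on the finished block
}

int main()
{
    run_block(true, true,
              [] { std::cout << "first touch\n"; },
              [] { std::cout << "main kernel\n"; },
              [] { std::cout << "last touch\n"; });
    return 0;
}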

5.3 Performance Benchmarking

To test the performance of our kernels constructed at runtime, and to see whether everything works together seamlessly, we ran some reference benchmarks.

We were given a number of benchmark configurations that we should check:

Benchmark Configuration

Variable         | 1st Value                         | 2nd Value                          | 3rd Value
-----------------|-----------------------------------|------------------------------------|------------------------------------
dtype            | FP32                              | FP32                               | FP32
prim_first_touch | None                              | None                               | Zero
prim_main        | GEMM                              | BRGEMM                             | BRGEMM
prim_last_touch  | None                              | None                               | ReLU
dim_types        | (M, N, K, M, N, K)                | (M, N, K, M, N, K)                 | (M, N, K, M, N, K)
exec_types       | (Seq, Seq, Seq, Prim, Prim, Prim) | (Seq, Seq, Prim, Prim, Prim, Prim) | (Seq, Seq, Prim, Prim, Prim, Prim)
dim_sizes        | (32, 32, 8, 32, 32, 32)           | (32, 32, 8, 32, 32, 32)            | (32, 32, 8, 32, 32, 32)
strides_in0      | (8192, 0, 1024, 1, 0, 32)         | (8192, 0, 1024, 1, 0, 32)          | (8192, 0, 1024, 1, 0, 32)
strides_in1      | (0, 8192, 1024, 0, 32, 1)         | (0, 8192, 1024, 0, 32, 1)          | (0, 8192, 1024, 0, 32, 1)
strides_out      | (32768, 1024, 0, 1, 32, 0)        | (32768, 1024, 0, 1, 32, 0)         | (32768, 1024, 0, 1, 32, 0)

However, when benchmarking our implementation with these strides, we ran into memory errors. For that reason, we decided to adjust the strides of the benchmarks slightly.

Our Benchmark Configuration (changed entries: strides_in0, strides_in1, strides_out)

Variable    | 1st Value                  | 2nd Value                  | 3rd Value
------------|----------------------------|----------------------------|----------------------------
dim_sizes   | (32, 32, 8, 32, 32, 32)    | (32, 32, 8, 32, 32, 32)    | (32, 32, 8, 32, 32, 32)
strides_in0 | (1024, 0, 32768, 1, 0, 32) | (1024, 0, 32768, 1, 0, 32) | (1024, 0, 32768, 1, 0, 32)
strides_in1 | (0, 8192, 32, 0, 1024, 1)  | (0, 8192, 32, 0, 1024, 1)  | (0, 8192, 32, 0, 1024, 1)
strides_out | (32, 32768, 0, 1, 1024, 0) | (32, 32768, 0, 1, 1024, 0) | (32, 32768, 0, 1, 1024, 0)

When benchmarking our configurations, we achieved the following GFLOPS performance:

GFLOPS performance of our benchmark configurations
Running TensorOperationBench benchmark #1
Total time (s):                  3.0052
Total reps:                      398
Total floating point operations: 213674622976
Estimated GFLOPS/sec:            71.1017
--------------------------------------------------
Running TensorOperationBench benchmark #2
Total time (s):                  3.0062
Total reps:                      413
Total floating point operations: 221727686656
Estimated GFLOPS/sec:            73.7568
--------------------------------------------------
Running TensorOperationBench benchmark #3
Total time (s):                  3.00738
Total reps:                      400
Total floating point operations: 214748364800
Estimated GFLOPS/sec:            71.4071
--------------------------------------------------

The results show that we achieve between 71 and 73 GFLOPS for all our executions. These results are roughly consistent with the performance of the kernels when benchmarked independently.

Note

Since the submission we have made some minor changes to our implementation. First, we fixed some errors and were then able to use the strides that we were provided with. Second, we decided to enhance our matmul_m_n_k implementation; afterwards we were able to generate kernels of size 16x4 instead of 8x4. This helped us increase the results from 71-73 GFLOPS to around 90-91 GFLOPS.

GFLOPS performance of the initial benchmark configuration with the enhanced matmul kernel
Running TensorOperationBench benchmark #1
Total time (s):                  3.00343
Total reps:                      510
Total floating point operations: 273804165120
Estimated GFLOPS/sec:            91.1637
--------------------------------------------------
Running TensorOperationBench benchmark #2
Total time (s):                  3.00537
Total reps:                      514
Total floating point operations: 275951648768
Estimated GFLOPS/sec:            91.8195
--------------------------------------------------
Running TensorOperationBench benchmark #3
Total time (s):                  3.00405
Total reps:                      505
Total floating point operations: 271119810560
Estimated GFLOPS/sec:            90.2515
--------------------------------------------------

5.4 Shared Memory Parallelization

To enable the execution of shared loops, we needed to make a few adjustments to our setup code:

gather shared loop IDs and dimension sizes
                           0,
                           0,
                           0,
                           dtype);
    m_kernel_gemm_main = m_brgemm_main.get_kernel();
}
else if (prim_main == ptype_t::identity)
{
    m_unary_main.generate(m_dim_sizes[m_dim_id_prim_M],
assign the size of the shared dimensions according to the order in dim_types
for (int i = l_dim_types_size - 1; i >= 0; i--)
{
    if (m_exec_types[i] == exec_t::prim)
    {
        if (m_dim_id_prim_M == -1 && m_dim_types[i] == dim_t::m)
        {
            m_dim_id_prim_M = i;
        }
        else if (m_dim_id_prim_N == -1 && m_dim_types[i] == dim_t::n)
        {
            m_dim_id_prim_N = i;
        }
        else if (m_dim_id_prim_K == -1 && m_dim_types[i] == dim_t::k)

In our execute function, we just need to check whether our m_num_parallel_loops variable is greater than zero. If it is, we execute our execute_iter_parallel function:

multiply shared loop sizes to get total number of iterations
                              ptr_in1,
                              ptr_out,
                              true,
                              true);
    }
}

The idea is to get a flat iteration space that can be used to parallelize over.
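Sketched with the member names introduced above (the exact code differs slightly), the flat loop looks roughly like this:

// total number of iterations is the product of all shared loop sizes
int64_t l_num_iters = 1;
for (size_t i = 0; i < m_shared_loop_sizes.size(); ++i)
{
    l_num_iters *= m_shared_loop_sizes[i];
}
// OpenMP distributes the flat index range across the available threads
#pragma omp parallel for
for (int64_t l_it_all = 0; l_it_all < l_num_iters; ++l_it_all)
{
    // unflatten l_it_all, compute the tensor offsets and call execute_iter (see below)
}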

We ‘unflatten’ the OpenMP iteration index l_it_all into a set of loop indices, one for each shared loop dimension. These indices are then used to compute the offsets for the in0, in1, and out tensors:

calculate the tensor offsets
                                             bool last_access)
{
    // there is only one iteration if the dimension is the first primitive
    const int64_t l_size = id_loop != m_id_first_primitive_loop ? m_dim_sizes[id_loop] : 1;
    const int64_t dtype_sz = dtype_size();
    const int64_t l_stride_in0 = m_strides_in0[id_loop] * dtype_sz;
    const int64_t l_stride_in1 = m_strides_in1[id_loop] * dtype_sz;
    const int64_t l_stride_out = m_strides_out[id_loop] * dtype_sz;

    for (int64_t l_iter = 0; l_iter < l_size; l_iter++)
    {
        bool is_first = first_access;
        bool is_last = last_access;
        // if the size is 1, it is always the first and last access

Here we are calculating the offset for the current thread. Every shared loop contributes to the calculation with its corresponding stride.
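The following self-contained example demonstrates the unflattening and the offset computation with made-up sizes and strides (output tensor only; our implementation does the same for in0 and in1):

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<int64_t> shared_loop_sizes  = {4, 2};     // two shared loops
    const std::vector<int64_t> shared_strides_out = {1024, 32}; // output strides of these loops
    const int64_t num_iters = shared_loop_sizes[0] * shared_loop_sizes[1];

    for (int64_t it_all = 0; it_all < num_iters; ++it_all)
    {
        int64_t remainder  = it_all;
        int64_t offset_out = 0;
        // walk the shared loops from innermost to outermost
        for (int64_t i = static_cast<int64_t>(shared_loop_sizes.size()) - 1; i >= 0; --i)
        {
            const int64_t idx = remainder % shared_loop_sizes[i];
            remainder /= shared_loop_sizes[i];
            offset_out += idx * shared_strides_out[i];
        }
        std::cout << "flat index " << it_all << " -> output offset " << offset_out << "\n";
    }
    return 0;
}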

Lastly, we call our execute_iter function. Depending on whether we have a seq dimension, we need to be careful which loop id we pass to the function:

call remaining loops with execute_iter
execute_iter(id_loop + 1,
             sub_ptr_in0,
             sub_ptr_in1,
             sub_ptr_out,
             is_first,
             is_last);

We also executed the benchmark configurations from the sequential execution task, this time using OMP_NUM_THREADS=4:

GFLOPS performance for the shared loop execution with 4 threads
Running SharedTensorOperationBench benchmark #1
Total time (s):                  3.00107
Total reps:                      1993
Total floating point operations: 1069983727616
Estimated GFLOPS/sec:            356.534
--------------------------------------------------
Running SharedTensorOperationBench benchmark #2
Total time (s):                  3.00002
Total reps:                      2168
Total floating point operations: 1163936137216
Estimated GFLOPS/sec:            387.976
--------------------------------------------------
Running SharedTensorOperationBench benchmark #3
Total time (s):                  3.00093
Total reps:                      2126
Total floating point operations: 1141387558912
Estimated GFLOPS/sec:            380.345
--------------------------------------------------

With the parallelization we achieve about 360 to 390 GFLOPS.

5.5 Optimization Passes

Our approach to enhancing the performance of the tensor operations was to use a vector of structs, one for each dimension:

Dimension struct used by the optimization passes
struct Dimension
{
    //! Type of the dimension (M, N, K)
    dim_t type = dim_t::m;
    //! Execution type (Prim, Seq, Shared, ...)
    exec_t exec_type = exec_t::undefined;
    //! Dimension size
    int64_t size = 0;
    //! Stride in the first input tensor
    int64_t stride_in0 = 0;
    //! Stride in the second input tensor
    int64_t stride_in1 = 0;
    //! Stride in the output tensor
    int64_t stride_out = 0;

    /**
     * @brief Construct a new Dimension object.
     *
     * @param type Type of the dimension (M, N, K).
     * @param exec_type Execution type (Prim, Seq, Shared, ...).
     * @param size Size of the dimension.
     * @param stride_in0 Stride in the first input tensor.
     * @param stride_in1 Stride in the second input tensor.
     * @param stride_out Stride in the output tensor.
     */
    Dimension(dim_t type,
              exec_t exec_type,
              int64_t size,
              int64_t stride_in0,
              int64_t stride_in1,
              int64_t stride_out)
        : type(type),
          exec_type(exec_type),
          size(size),
          stride_in0(stride_in0),
          stride_in1(stride_in1),
          stride_out(stride_out)
    {
        if (size <= 0)
        {
            throw std::invalid_argument("Dimension size needs to be greater than 0");
        }
    }
};

This struct is used to store all information about a dimension.

After setting this up, we could create our optimization passes.

5.5.1 Primitive Identification

The first optimization that we performed was to find primitive dimensions. This optimization is useful for cases where we are given only sequential loops. Our approach to this optimization was the following:

finding the K2 prim dimension for the BRGEMM case
        }
    }

    // lastly, set all remaining dimensions to seq
    for (auto &dim : dimensions)
    {
        if (dim.exec_type == exec_t::undefined)
        {
            dim.exec_type = exec_t::seq;
        }
    }

    return; // all primary dimensions set
}
// BINARY CASE
else if (!l_has_k_dim)
{
    // check for existing primary dimensions
    int prim_count = std::count_if(dimensions.begin(), dimensions.end(),
                                     [](const mini_jit::ir::Dimension &dim)
                                     {
                                         return dim.type == dim_t::c && dim.exec_type == exec_t::prim;
                                     });

We identify the respective dimensions by looking at the strides in the in1 and out tensors. As a starting point, we use the fact that a column-major BRGEMM requires a certain unit-stride pattern for its tensors (...M, ...K1 -> ...M), which we try to follow. This means that the K2 dimension we need for our BRGEMM must not have unit stride in the first input tensor.
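As an illustration, here is a hedged sketch (not our exact code) of how such a K2 candidate could be located with std::find_if, using the Dimension struct from section 5.5; the assumed criteria are: K type, no unit stride in either input, and stride 0 in the output:

auto l_dim_k2_it = std::find_if(dimensions.begin(), dimensions.end(),
                                [](const mini_jit::ir::Dimension &dim)
                                {
                                    return dim.type == dim_t::k &&
                                           dim.stride_in0 != 1 &&
                                           dim.stride_in1 != 1 &&
                                           dim.stride_out == 0;
                                });
if (l_dim_k2_it != dimensions.end())
{
    l_dim_k2_it->exec_type = exec_t::prim;
    // rotate K2 to the back so that the later rotations yield ..., K2, M, N, K1
    std::rotate(l_dim_k2_it, l_dim_k2_it + 1, dimensions.end());
}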

Similarly, we did the same for:

  • M dimension

  • N dimension

  • K1 dimension

For the N dimension, where we had no indication of which tensor to base the decision on, we simply chose the N dimension with the smallest stride:

finding the N prim dimension with smallest stride
        std::rotate(l_dim_m_it, l_dim_m_it + 1, dimensions.end());
    }
}
else
{
    throw std::invalid_argument("Optimizer: No suitable primary dimension M found.");
}

/////////////////////////////////////////////////////////////////
// FIND PRIM N
/////////////////////////////////////////////////////////////////
// req: choose the one with smallest stride
int l_n_dim_stride = INT_MAX;
int l_n_dim_id = -1;
for (size_t i = 0; i < dimensions.size(); i++)
{
    if (dimensions[i].type == dim_t::n &&
        dimensions[i].stride_in0 == dimensions[i].stride_in1)
    {

We performed the identification in the order K2, M, N, K1. The reason for this order is that after identifying a dimension, we rotate it to the end of the vector. This ultimately leads to the structure ..., K2, M, N, K1 for our identified primitive dimensions.
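To illustrate the effect of these rotations, here is a small standalone example that uses plain characters instead of Dimension objects:

#include <algorithm>
#include <iostream>
#include <string>

int main()
{
    // lower-case letters stand for arbitrary outer dimensions,
    // 'K' for K2 and 'k' for K1
    std::string dims = "aKbMcNdk";
    // identify and rotate to the back in the order K2, M, N, K1
    for (char c : std::string("KMNk"))
    {
        auto it = std::find(dims.begin(), dims.end(), c);
        std::rotate(it, it + 1, dims.end());
    }
    std::cout << dims << "\n"; // prints "abcdKMNk", i.e. ..., K2, M, N, K1
    return 0;
}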

5.5.2 Dimension Splitting

For our second optimization pass we decided to look at the dimension sizes of our loops. We introduced a max_kernel_size parameter, which specifies the maximum allowed size for a dimension. If a dimension with a size larger than the maximum is found, the dimension splitter will try to split it into new dimensions with optimized sizes. The entry point for this optimization is the splitDimensions function:

splitDimensions function of the Optimizer
void mini_jit::ir::Optimizer::splitDimensions(std::vector<mini_jit::ir::Dimension> &dimensions,
                                              int64_t max_kernel_size)
{
    // Dimensions should be split if they are too large (> max_kernel_size)
    for (size_t i = 0; i < dimensions.size(); i++)
    {
        if (dimensions[i].size > max_kernel_size)
        {
            int64_t l_size_dim_0 = 0;
            int64_t l_size_dim_1 = 0;
            findBestSplit(dimensions[i].size,
                          max_kernel_size,
                          dimensions[i].type,
                          l_size_dim_0,
                          l_size_dim_1);
            if (l_size_dim_0 > 1)
            {
                // create a new seq dimension
                mini_jit::ir::Dimension l_dim_new(dimensions[i].type,
                                                  exec_t::seq,
                                                  l_size_dim_0,
                                                  dimensions[i].stride_in0 * l_size_dim_1,
                                                  dimensions[i].stride_in1 * l_size_dim_1,
                                                  dimensions[i].stride_out * l_size_dim_1);
                // update the original dimension size
                dimensions[i].size = l_size_dim_1;
                // insert the new dimension at the back, so it will be checked for a split again
                dimensions.push_back(l_dim_new);
            }
        }
    }
}

For each dimension, it finds the best split for our kernels if the dimension size is too large, and creates a new dimension. The size of the original dimension is updated to l_size_dim_1, which will be smaller than or equal to max_kernel_size. However, the new dimension l_dim_new might still have a size larger than max_kernel_size, which is why it is inserted at the end of the dimensions vector, where it will be checked for a possible split again in a later iteration.
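As a hypothetical usage example (namespaces omitted and assuming splitDimensions is callable as a static member), an M dimension of size 1600 with unit stride and max_kernel_size = 1024 would be handled as follows:

std::vector<mini_jit::ir::Dimension> dims = {
    // type, exec type, size, stride_in0, stride_in1, stride_out
    {dim_t::m, exec_t::seq, 1600, 1, 0, 1}
};
mini_jit::ir::Optimizer::splitDimensions(dims, 1024);
// dims[0] now has size 800 and keeps its original strides; a new seq M dimension
// of size 2 with strides (800, 0, 800) was appended at the back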

But what does findBestSplit do?

The way our kernels were implemented makes their execution more efficient for specific dimension sizes. Considering the M dimension, a size that is a multiple of 16 is optimal for most kernels, since we manually optimized the kernels for this case. As for the N dimension size, a multiple of 4 is optimal for most kernels. In the K dimension, we do not have such optimizations and the dimension size can be chosen freely, as long as it is smaller than max_kernel_size. The following code snippet shows the implementation of findBestSplit for the M and N dimensions:

findBestSplit function of the Optimizer for M and N
o_size_0 = 1;
o_size_1 = i_size;
if (i_type == dim_t::m)
{
    // multiples of (multiples of) 4 are efficient (LDP, STP)
    for (int64_t i = 16; i > 4; i -= 4)
    {
        findLargestMultipleOfDivisor(i, i_size, i_max_kernel_size, o_size_0, o_size_1);
        if (o_size_0 > 1)
        {
            return;
        }
    }
    // split by 2
    findLargestMultipleOfDivisor(2, i_size, i_max_kernel_size, o_size_0, o_size_1);
    if (o_size_0 > 1)
    {
        return;
    }
}
// for n, we want multiples of 4
else if (i_type == dim_t::n)
{
    // split by 4
    findLargestMultipleOfDivisor(4, i_size, i_max_kernel_size, o_size_0, o_size_1);
    if (o_size_0 > 1)
    {
        return;
    }
    // split by 2
    findLargestMultipleOfDivisor(2, i_size, i_max_kernel_size, o_size_0, o_size_1);
    if (o_size_0 > 1)
    {
        return;
    }
}

But what does findLargestMultipleOfDivisor do?

As the name suggests, this helper function tries to find the largest multiple of a given divisor. Let’s say the given divisor is 16, the input dimension size is 1600 and the i_max_kernel_size is 1024. Then, findLargestMultipleOfDivisor will try to find the largest multiple of 16 which divides 1600 and is smaller than or equal to 1024. The result of this computation is 2 for o_size_0 and 800 for o_size_1. For the more curious reader, the implementation of findLargestMultipleOfDivisor is given below:

findLargestMultipleOfDivisor function of the Optimizer
void mini_jit::ir::Optimizer::findLargestMultipleOfDivisor(int64_t i_divisor,
                                                           int64_t i_size,
                                                           int64_t i_max_size,
                                                           int64_t &o_size_0,
                                                           int64_t &o_size_1)
{
    if (i_divisor <= 0 || i_size <= 0 || i_max_size <= 0 || i_divisor > i_max_size)
    {
        return;
    }

    // start: largest multiple of i_divisor < i_max_size
    int64_t l_max_divisible = (i_max_size / i_divisor) * i_divisor;
    for (int64_t l_m = l_max_divisible; l_m >= i_divisor; l_m -= i_divisor)
    {
        // we found an m that divides i_size! it is also the largest
        if (i_size % l_m == 0)
        {
            o_size_0 = i_size / l_m;
            o_size_1 = l_m;
            return;
        }
    }
}

5.5.3 Shared Memory Parallelization

Our third optimization pass was to turn every loop that is not a prim dimension and is of dimension type M or N into a shared loop. For that we first check how many loops already have the exec type shared:

finding possible iterations for shared loops
{
    if (dimensions[i].type == dim_t::n &&
        dimensions[i].stride_in0 == 0)
    {
        int l_current_strides = dimensions[i].stride_in1 + dimensions[i].stride_out;
        if (l_current_strides < l_n_dim_strides)
        {
            l_n_dim_strides = l_current_strides;

If we already have a high number of shared loops, we do not create any more and simply return. Otherwise, we check the seq dimensions for potential candidates:

select shared loop candidates
if (l_n_dim_id != static_cast<int>(dimensions.size()) - 1)
{
    std::rotate(dimensions.begin() + l_n_dim_id, dimensions.begin() + l_n_dim_id + 1, dimensions.end());
}
/////////////////////////////////////////////////////////////////
// FIND PRIM K
/////////////////////////////////////////////////////////////////
// req: unit stride in in1, stride_out has to be 0
auto l_dim_k_it = std::find_if(dimensions.begin(), dimensions.end(),
                               [](const mini_jit::ir::Dimension &dim)
                               {
                                   return dim.type == dim_t::k &&
                                          dim.stride_in1 == 1 &&

As a last step we move all our shared loops to the front of the order:

move shared loops to the front
if (l_dim_k_it != dimensions.end())
{
    // set dimension data
    l_dim_k_it->type = dim_t::k;
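The reordering itself can be done with std::stable_partition, as in the following fragment (this is the same call we also use in section 5.6.2):

// move all shared dimensions to the front while keeping their relative order
std::stable_partition(dimensions.begin(), dimensions.end(),
                      [](const mini_jit::ir::Dimension &dim)
                      {
                          return dim.exec_type == exec_t::shared;
                      });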

5.5.4 Dimension Fusion

Note

This part of the Optimizer was implemented much later, as part of the 7.4 Progress of Week 2 of our final project phase.

The idea behind Dimension Fusion is that when certain dimensions have very small sizes, fusing them can improve cache efficiency and simplify tensor expressions. It also enables our existing dimension splitter to operate more effectively, as it can now split the fused dimensions in ways optimized for our kernels, rather than being constrained by the original tensor structure. In other words, Dimension Fusion will be the first step in our optimizer, simplifying the tensor expression upfront so it can then be split in an optimized way and finally, have its primitive dimensions identified.

The first step was to introduce a new min_kernel_size parameter. It allows the user to specify the minimum dimension size a kernel should have. If a dimension is smaller than that, the dimension fuser will try to look for candidates to fuse with. This process happens in the new fuseDimensions function of the Optimizer.

Dimension Fusing in the Optimizer
void mini_jit::ir::Optimizer::fuseDimensions(std::vector<mini_jit::ir::Dimension> &dimensions,
                                             int64_t min_kernel_size)
{
    for (size_t i = 0; i < dimensions.size(); i++)
    {
        mini_jit::ir::Dimension &l_dim_0 = dimensions[i];
        if (l_dim_0.size < min_kernel_size)
        {
            // find a dimension that can be fused with the current one
            for (size_t j = 0; j < dimensions.size(); j++)
            {
                if (i == j) continue; // skip self

                mini_jit::ir::Dimension &l_dim_1 = dimensions[j];
                if (l_dim_0.type == l_dim_1.type &&
                    (l_dim_0.exec_type == l_dim_1.exec_type ||
                    l_dim_0.exec_type == exec_t::undefined ||
                    l_dim_1.exec_type == exec_t::undefined) &&
                    l_dim_1.stride_in0 == l_dim_0.size * l_dim_0.stride_in0 &&
                    l_dim_1.stride_in1 == l_dim_0.size * l_dim_0.stride_in1 &&
                    l_dim_1.stride_out == l_dim_0.size * l_dim_0.stride_out)
                {
                    // fuse the two dimensions
                    l_dim_0.size *= l_dim_1.size;
                    // remove the fused dimension
                    dimensions.erase(dimensions.begin() + j);
                    j--; // adjust index after erasing
                }
            }
        }
    }
}

Here, l_dim_0 is the dimension whose size is smaller than min_kernel_size, meaning that we would like to fuse it with another candidate. However, the candidate (l_dim_1) the function looks for needs to fulfill some criteria:

  1. Same dimension type as l_dim_0 (M, N, K, C)

  2. Same execution type as l_dim_0, or either type is undefined

  3. The stride of l_dim_1 needs to equal the product of the stride and size of l_dim_0 (two dimensions X and Y can be fused if, for all tensors: stride(X) = |Y| ⨉ stride(Y))

If a fitting candidate has been found, l_dim_0 and l_dim_1 can be fused. This involves multiplying the dimension sizes and removing the candidate from the dimensions vector. The strides do not need to be adjusted, as the original stride of the small l_dim_0 is still correct.
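A small worked example (hypothetical values, namespaces omitted and assuming fuseDimensions is callable as a static member): two contiguous M dimensions of sizes 4 and 8 are fused into a single M dimension of size 32.

std::vector<mini_jit::ir::Dimension> dims = {
    // type, exec type, size, stride_in0, stride_in1, stride_out
    {dim_t::m, exec_t::undefined, 4, 1, 0, 1},
    {dim_t::m, exec_t::undefined, 8, 4, 0, 4} // strides = |dim 0| * strides of dim 0
};
mini_jit::ir::Optimizer::fuseDimensions(dims, /*min_kernel_size=*/16);
// dims now contains a single M dimension of size 32 with strides (1, 0, 1)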

After implementing dimension fusion, we also had to make adjustments to the dimension splitter. Previously, we would split dimensions by finding the largest possible split for one dimension. For example, if the given dimension size was 1600 and the maximum kernel size 1024, the function would have returned 2 for o_size_0 and 800 for o_size_1, because 800 is the largest multiple of 16 that is less than or equal to 1024. This was problematic, because we then had a dimension of size 2, which is very small and could have led to inefficiencies. Our solution to this problem was to introduce the min_kernel_size parameter to the dimension splitter as well. Specifically, we adjusted the findBestSplit function, which now only returns a split if the min_kernel_size is reached:

Updated findBestSplit function for M dimensions
if (i_type == dim_t::m)
{
    // multiples of (multiples of) 4 are efficient (LDP, STP)
    for (int64_t i = 16; i > 4; i -= 4)
    {
        findLargestMultipleOfDivisor(i, i_size, i_max_kernel_size, i_min_kernel_size, o_size_0, o_size_1);
        if (o_size_0 >= i_min_kernel_size)
        {
            return;
        }
    }
    // split by 2
    findLargestMultipleOfDivisor(2, i_size, i_max_kernel_size, i_min_kernel_size, o_size_0, o_size_1);
    if (o_size_0 >= i_min_kernel_size)
    {
        return;
    }
}

Consequently, findLargestMultipleOfDivisor had to be adjusted as well, with a simple if-condition:

Updated findLargestMultipleOfDivisor functionalities
void mini_jit::ir::Optimizer::findLargestMultipleOfDivisor(int64_t i_divisor,
                                                          int64_t i_size,
                                                          int64_t i_max_size,
                                                          int64_t i_min_size,
                                                          int64_t &o_size_0,
                                                          int64_t &o_size_1)
{
    if (i_divisor <= 0 || i_size <= 0 || i_max_size <= 0 || i_min_size <= 0 ||
        i_divisor > i_max_size || i_size < i_min_size)
    {
        return;
    }

    // start: largest multiple of i_divisor < i_max_size
    int64_t l_max_divisible = (i_max_size / i_divisor) * i_divisor;
    for (int64_t l_m = l_max_divisible; l_m >= i_divisor; l_m -= i_divisor)
    {
        // we found an m that divides i_size! it is also the largest
        if (i_size % l_m == 0)
        {
            int64_t candidate_size_0 = i_size / l_m;
            int64_t candidate_size_1 = l_m;
            if (candidate_size_0 >= i_min_size && candidate_size_1 >= i_min_size)
            {
                o_size_0 = candidate_size_0;
                o_size_1 = candidate_size_1;
                return;
            }
        }
    }
}

Candidates for splitting are now only chosen if both dimension sizes are at least as large as the specified minimum kernel size. Therefore, the new dimension splitter now outputs 50 and 32 as a split of 1600, if min_kernel_size is set to 16.

5.5.5 Performance Benchmarks

We also benchmarked the results for some configurations:

GFLOPS performance for sample configurations
Running SharedTensorOperationBench benchmark #1
Total time (s):                  3.01164
Total reps:                      101
Total floating point operations: 827392000000
Estimated GFLOPS/sec:            274.731
--------------------------------------------------
#####################################################
Testing different kernel sizes
#####################################################
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 1024)
Total time (s):                  3.02859
Total reps:                      105
Total floating point operations: 860160000000
Estimated GFLOPS/sec:            284.013
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 1024)
Total time (s):                  3.02753
Total reps:                      105
Total floating point operations: 860160000000
Estimated GFLOPS/sec:            284.113
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 512)
Total time (s):                  3.0037
Total reps:                      95
Total floating point operations: 778240000000
Estimated GFLOPS/sec:            259.093
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 512)
Total time (s):                  3.00784
Total reps:                      95
Total floating point operations: 778240000000
Estimated GFLOPS/sec:            258.738
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 256)
Total time (s):                  3.01035
Total reps:                      112
Total floating point operations: 917504000000
Estimated GFLOPS/sec:            304.783
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 256)
Total time (s):                  3.00121
Total reps:                      110
Total floating point operations: 901120000000
Estimated GFLOPS/sec:            300.253
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 125)
Total time (s):                  3.00583
Total reps:                      129
Total floating point operations: 1056768000000
Estimated GFLOPS/sec:            351.573
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 125)
Total time (s):                  3.00655
Total reps:                      129
Total floating point operations: 1056768000000
Estimated GFLOPS/sec:            351.488
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 64)
Total time (s):                  3.01405
Total reps:                      119
Total floating point operations: 974848000000
Estimated GFLOPS/sec:            323.435
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 64)
Total time (s):                  3.01289
Total reps:                      119
Total floating point operations: 974848000000
Estimated GFLOPS/sec:            323.559
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 32)
Total time (s):                  3.00815
Total reps:                      125
Total floating point operations: 1024000000000
Estimated GFLOPS/sec:            340.409
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 32)
Total time (s):                  3.00952
Total reps:                      126
Total floating point operations: 1032192000000
Estimated GFLOPS/sec:            342.975
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 16)
Total time (s):                  3.0137
Total reps:                      40
Total floating point operations: 327680000000
Estimated GFLOPS/sec:            108.73
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 16)
Total time (s):                  3.01834
Total reps:                      119
Total floating point operations: 974848000000
Estimated GFLOPS/sec:            322.975
--------------------------------------------------

Depending on the selected kernel sizes and thread targets, our results varied considerably. The highest performance we achieved was around 350 GFLOPS.

5.6 Unary Operations

After supporting the standard primitives with GEMM and BRGEMM, we now also allow primitives like copy / identity, transposition and permutation.

5.6.1 Backend Extension

The distinction from the other primitives here is that all dimensions are of type dim_t::c. To allow this kind of primitive, we had to make some small adjustments to our TensorOperation:

find M and N dimensions based on stride in the input
        }
        else if (m_dim_id_prim_K != -1 && m_dim_id_prim_BR == -1 && m_dim_types[i] == dim_t::k)
        {
            m_dim_id_prim_BR = i;
        }
    }
}

/////////////////////////////////////////////////////////////////////
// Read SEQ and SHARED dimensions using dim types
/////////////////////////////////////////////////////////////////////
for (size_t i = 0; i < m_dim_types.size(); ++i)
{
    if (m_exec_types[i] == exec_t::seq)
    {
        if (m_dim_types[i] == dim_t::m)
        {
            m_dim_id_seq_M = i;
        }
        else if (m_dim_types[i] == dim_t::n)
        {
generate identity primitive
        m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
}
else
{
    // GEMM & BRGEMM
    m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_K];
    m_adjusted_stride_in1 = m_strides_in1[m_dim_id_prim_N];
    m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
}

5.6.2 Optimization Passes

To run our optimization passes on these primitives, we again had to make some adjustments, this time in our Optimizer.

For our identifyPrimitives function, we first check whether dim_t::c occurs as a dimension type. If it does, we continue to find the prim dimensions:

error handling for correct dimension types
// Handle identity case first
auto l_has_c_dim = std::any_of(dimensions.begin(), dimensions.end(),
                               [](const mini_jit::ir::Dimension &dim)
                               {
                                   return dim.type == dim_t::c;
                               });

// Handle binary case
auto l_has_k_dim = std::any_of(dimensions.begin(), dimensions.end(),
                               [](const mini_jit::ir::Dimension &dim)
                               {
                                   return dim.type == dim_t::k;
                               });

if (l_has_c_dim)
{
    // check that all dimensions are c
    if (!std::all_of(dimensions.begin(), dimensions.end(),
                     [](const mini_jit::ir::Dimension &dim)
exit early, if all prim dimensions are already set
}
// check for existing primary dimensions
int prim_c_count = std::count_if(dimensions.begin(), dimensions.end(),
                                 [](const mini_jit::ir::Dimension &dim)
                                 {
                                     return dim.type == dim_t::c && dim.exec_type == exec_t::prim;
                                 });
if (prim_c_count == 2)
{
    return; // primary dimensions already set
}
else if (prim_c_count != 0)
{
    throw std::invalid_argument("Optimizer: Expected 0 or 2 primary dimensions of type 'c', found " + std::to_string(prim_c_count) + ". Try setting all dimensions to seq or undefined.");
}

/////////////////////////////////////////////////////////////////
// FIND UNARY PRIM M
/////////////////////////////////////////////////////////////////
// req: unit stride in in0. Out might not be 1 if operation transposes.
auto l_dim_m_it = std::find_if(dimensions.begin(), dimensions.end(),
                               [](const mini_jit::ir::Dimension &dim)
                               {
                                   return (dim.type == dim_t::c) &&
                                          dim.stride_in0 == 1 &&
                                          dim.stride_in1 == 0;
                               });

bool l_transpose = false;
124
125bool l_transpose = false;
check for transposition and find dimensions accordingly
{
    // transpose if the output stride in M is not 1
    l_transpose = l_dim_m_it->stride_out != 1;
    // set dimension data
    l_dim_m_it->exec_type = exec_t::prim;
    if (l_dim_m_it != dimensions.end() - 1)
    {
        // move M to the back
        std::rotate(l_dim_m_it, l_dim_m_it + 1, dimensions.end());
    }
}
else
{
    throw std::invalid_argument("Optimizer: No suitable primary dimension M found.");
}

if (l_transpose)
{
    /////////////////////////////////////////////////////////////////
    // FIND UNARY PRIM N
    /////////////////////////////////////////////////////////////////
    // req: unit stride in out.
    auto l_dim_n_it = std::find_if(dimensions.begin(), dimensions.end(),
                                   [](const mini_jit::ir::Dimension &dim)
                                   {
                                       return (dim.type == dim_t::c) &&
                                              dim.stride_out == 1 &&
                                              dim.stride_in1 == 0;
                                   });

If we do not have a transposition, we simply look for the smallest stride and set the dimensions accordingly:

set dimensions for identity primitive
        l_dim_n_it->exec_type = exec_t::prim;
        if (l_dim_n_it != dimensions.end() - 1)
        {
            // move N to the back
            std::rotate(l_dim_n_it, l_dim_n_it + 1, dimensions.end());
        }
    }
    else
    {
        throw std::invalid_argument("Optimizer: No suitable primary dimension N found.");
    }
}
else
{
    // TODO: check if this is ok

    /////////////////////////////////////////////////////////////////
    // FIND UNARY PRIM N
    /////////////////////////////////////////////////////////////////
    // req: choose the one with the smallest stride in in0
    int l_n_dim_stride = INT_MAX;

The last step is to set the remaining undefined dimensions to seq, as the next optimization pass will look for ideal shared loops.

However, in our createSharedLoops function, we did not have to make any adjustments.

For our splitDimensions function, we now also check whether we have dim_t::c as a dimension type:

split dimensions of type dim_t::c
        // increase thread number for each existing shared dimension
        l_num_threads *= dimensions[i].size;
    }
}

if (l_num_threads >= thread_target)
{
    // make sure that the shared loops are at the front
    std::stable_partition(dimensions.begin(), dimensions.end(),
                          [](const mini_jit::ir::Dimension &dim)
                          {
                              return dim.exec_type == exec_t::shared;
                          });
    // no need to create more shared loops
    return;
}

// Creation of new shared loops:
for (size_t i = 0; i < dimensions.size(); i++)
{
    // if the dimension can be set to shared and we did not reach the target number of threads yet
    // we set the dimension to shared
    // also dont parallelize the k dimension (see class slides)
    if ((dimensions[i].exec_type == exec_t::seq || dimensions[i].exec_type == exec_t::undefined) &&
        dimensions[i].type != dim_t::k &&

5.6.3 Reference Implementation

For our reference implementation, we used an example with 4 dimensions in the order trus, which we permute to the order turs.

initialize sizes for tensors
{
    const mini_jit::ptype_t first_touch_type = mini_jit::ptype_t::none;
    const mini_jit::ptype_t main_type = mini_jit::ptype_t::identity;
    const mini_jit::ptype_t last_touch_type = mini_jit::ptype_t::none;

    const int T = GENERATE(3, 4, 7);
    const int R = GENERATE(3, 4, 7);
    const int U = GENERATE(3, 4, 7);
    const int S = GENERATE(3, 4, 7);

    const int SIZE = T * R * U * S;

    float *A = new float[SIZE];
    float *C = new float[SIZE];
    float *C_expected = new float[SIZE];

    std::random_device rd;
    std::mt19937 gen(rd());
fill the tensors with values
for (int i = 0; i < SIZE; ++i)
{
    A[i] = dist(gen);
}

// Compute C_expected
for (int t = 0; t < T; ++t)
{
    for (int r = 0; r < R; ++r)
    {
        for (int u = 0; u < U; ++u)
        {
            for (int s = 0; s < S; ++s)
            {
                // Calculate index in output format (t,u,r,s) using strides_out
                int l_idx_c_exp = t * (U * R * S) + r * S + u * (R * S) + s;
                // Calculate index in input format (t,r,u,s) using strides_in0
                int l_idx_a = t * (R * U * S) + u * S + r * (U * S) + s;
                C_expected[l_idx_c_exp] = A[l_idx_a];
            }
        }
    }

Then we prepare the execution by setting all arguments accordingly:

prepare arguments for execution
std::vector<mini_jit::dim_t> dim_types = {
    mini_jit::dim_t::c, // t
    mini_jit::dim_t::c, // r
    mini_jit::dim_t::c, // u
    mini_jit::dim_t::c  // s
};

std::vector<mini_jit::exec_t> exec_types = {
    mini_jit::exec_t::seq,  // t
    mini_jit::exec_t::seq,  // r
    mini_jit::exec_t::prim, // u
    mini_jit::exec_t::prim  // s
};

std::vector<int64_t> dim_sizes = {
    T, R, U, S};

std::vector<int64_t> strides_in0 = {
    R * U * S, // t
    U * S,     // r
    S,         // u
    1          // s
};

std::vector<int64_t> strides_in1 = {0, 0, 0, 0};

std::vector<int64_t> strides_out = {
    U * R * S, // t
    S,         // r
    R * S,     // u
    1          // s
};

Finally, we would execute our implementation.