5. Tensor Operation Backend

After experimenting with different Neon implementations and developing kernels for our GEMM and BRGEMM, and most recently for the unary primitives, it is now time to combine all of these kernels in a common backend.

5.1 User Interface

The first thing that we need for our backend is a common entry point: our setup function. Within the setup function we parse a number of configuration parameters, from which the corresponding kernels and primitives are constructed at runtime.

In our setup function, we check several things:

error handling for dimensions, execution types and data type
/////////////////////////////////////////////////////////////////////
// Check the number of dimensions
/////////////////////////////////////////////////////////////////////
if (dim_types.size() != dim_sizes.size() ||
    dim_types.size() != strides_in0.size() ||
    dim_types.size() != strides_in1.size() ||
    dim_types.size() != strides_out.size())
{
    return error_t::wrong_dimension;
}

/////////////////////////////////////////////////////////////////////
// Check the number of prim exec types
/////////////////////////////////////////////////////////////////////
int prim_count = std::count(exec_types.begin(), exec_types.end(), exec_t::prim);
if (prim_main == ptype_t::brgemm && prim_count != 4)
{
    return error_t::wrong_exec_type;
}
else if (prim_main == ptype_t::gemm && prim_count != 3)
{
    return error_t::wrong_exec_type;
}
else if (prim_main == ptype_t::identity && prim_count != 2)
{
    return error_t::wrong_exec_type;
}
else if (prim_main == ptype_t::add || prim_main == ptype_t::sub ||
         prim_main == ptype_t::mul || prim_main == ptype_t::div ||
         prim_main == ptype_t::min || prim_main == ptype_t::max)
{
    if (prim_count != 2)
    {
        return error_t::wrong_exec_type;
    }
}

/////////////////////////////////////////////////////////////////////
// Check allowed data type
/////////////////////////////////////////////////////////////////////
if (dtype != dtype_t::fp32)
{
    return error_t::wrong_dtype;
}

/////////////////////////////////////////////////////////////////////
// Check allowed primitive types
/////////////////////////////////////////////////////////////////////
std::vector<ptype_t> allowed_first_touch_types = {
    ptype_t::none,
    ptype_t::zero,
    ptype_t::relu,
    ptype_t::reciprocal,
    ptype_t::increment,
    ptype_t::decrement
};
std::vector<ptype_t> allowed_main_types = {
    ptype_t::none,
    ptype_t::identity,
    ptype_t::brgemm,
    ptype_t::gemm,
    ptype_t::add,
    ptype_t::sub,
    ptype_t::mul,
    ptype_t::div,
    ptype_t::min,
    ptype_t::max
};
std::vector<ptype_t> allowed_last_touch_types = {
    ptype_t::none,
    ptype_t::relu,
    ptype_t::square,
    ptype_t::reciprocal,
    ptype_t::decrement,
    ptype_t::fast_sigmoid,
    ptype_t::sigmoid_interp,
    ptype_t::sigmoid_taylor,
};

if (std::find(allowed_first_touch_types.begin(), allowed_first_touch_types.end(), prim_first_touch) == allowed_first_touch_types.end())
{
    return error_t::wrong_ptype;
}
if (std::find(allowed_main_types.begin(), allowed_main_types.end(), prim_main) == allowed_main_types.end())
{
    return error_t::wrong_ptype;
}
if (std::find(allowed_last_touch_types.begin(), allowed_last_touch_types.end(), prim_last_touch) == allowed_last_touch_types.end())
{
    return error_t::wrong_ptype;
}

/////////////////////////////////////////////////////////////////////
// Assign member variables
/////////////////////////////////////////////////////////////////////

We need to know the position of the first prim dimension to determine when to call the main kernel instead of recursing deeper into the loop structure. In other words, we traverse the sequential loops first, and as soon as we reach the first primitive dimension, we start calling the main kernel.
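To illustrate this idea, here is a small, self-contained sketch with made-up sizes (it is not our actual code; fragments of the real execute_iter appear in the following sections):

#include <cstdint>
#include <iostream>
#include <vector>

// two outer sequential loops; everything from dimension 2 onwards is assumed
// to be handled inside the generated main kernel
static const std::vector<int64_t> dim_sizes = {2, 3};
static const int64_t id_first_primitive_loop = 2;

void execute_iter_sketch(int64_t id_loop, std::vector<int64_t> &indices)
{
    if (id_loop >= id_first_primitive_loop)
    {
        // base case: the remaining dimensions belong to the primitive
        std::cout << "main kernel call for iteration ("
                  << indices[0] << ", " << indices[1] << ")\n";
        return;
    }
    for (int64_t i = 0; i < dim_sizes[id_loop]; ++i)
    {
        indices[id_loop] = i;
        execute_iter_sketch(id_loop + 1, indices);
    }
}

int main()
{
    std::vector<int64_t> indices(2, 0);
    execute_iter_sketch(0, indices);
    return 0;
}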

assign the size of the prim dimensions according to the order in dim_types
m_dim_id_seq_N = -1;
m_dim_id_seq_K = -1;
m_dim_id_sha_M = -1;
m_dim_id_sha_N = -1;
m_num_parallel_loops = 0;

/////////////////////////////////////////////////////////////////////
// Find first PRIM and SEQ dimensions in exec types
/////////////////////////////////////////////////////////////////////
auto it = std::find(exec_types.begin(), exec_types.end(), exec_t::prim);
if (it != exec_types.end())
{
    m_id_first_primitive_loop = std::distance(exec_types.begin(), it);
}
else
{
    m_id_first_primitive_loop = 0;
}

it = std::find(exec_types.begin(), exec_types.end(), exec_t::seq);
if (it != exec_types.end())
{
    m_id_first_seq_loop = std::distance(exec_types.begin(), it);
}
else
{
    m_id_first_seq_loop = -1;
}
assign the size of the seq and shared dimensions according to the order in dim_types
/////////////////////////////////////////////////////////////////////
// Find SHARED dimensions in exec types
/////////////////////////////////////////////////////////////////////
m_shared_loop_ids.clear();
m_shared_loop_sizes.clear();
for (size_t i = 0; i < m_exec_types.size(); ++i)
{
    if (m_exec_types[i] == exec_t::shared)
    {
        m_shared_loop_ids.push_back(i);
        m_shared_loop_sizes.push_back(m_dim_sizes[i]);
    }
}

/////////////////////////////////////////////////////////////////////
// Read PRIM dimensions using dim types (No Copy)
/////////////////////////////////////////////////////////////////////
// convert to int so negative values are allowed
int l_dim_types_size = static_cast<int>(m_dim_types.size());
for (int i = l_dim_types_size - 1; i >= 0; i--)
{
    if (m_exec_types[i] == exec_t::prim)
    {
        if (m_dim_id_prim_M == -1 && m_dim_types[i] == dim_t::m)
        {
            m_dim_id_prim_M = i;
        }
        else if (m_dim_id_prim_N == -1 && m_dim_types[i] == dim_t::n)
        {
            m_dim_id_prim_N = i;
        }
        else if (m_dim_id_prim_K == -1 && m_dim_types[i] == dim_t::k)
        {

After checking all these things, we were then able to create our kernels accordingly.

construct kernels based on assigned member variables
// Check for Transposition
/////////////////////////////////////////////////////////////////////
if (m_dim_id_prim_M != -1)
{
    int64_t l_stride_in0 = m_strides_in0[m_dim_id_prim_M];
    int64_t l_stride_out = m_strides_out[m_dim_id_prim_M];
    // set transpose flag to true if the strides are different
    m_transpose_output = l_stride_in0 != l_stride_out;
}
else
{
    // idk if we can check for transposition without M
    m_transpose_output = false;
}

/////////////////////////////////////////////////////////////////////
// Adjust strides based on primitive type and transposition
/////////////////////////////////////////////////////////////////////
if (prim_main == ptype_t::identity)
{
    if (!m_transpose_output)
    {
        m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_N];
        m_adjusted_stride_in1 = 0;
        m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
    }
    else
    {
        m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_N];
        m_adjusted_stride_in1 = 0;
        m_adjusted_stride_out = m_strides_out[m_dim_id_prim_M];
    }
}
else if(prim_main == ptype_t::add || prim_main == ptype_t::sub ||
        prim_main == ptype_t::mul || prim_main == ptype_t::div ||
        prim_main == ptype_t::min || prim_main == ptype_t::max)
{
        m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_N];
        m_adjusted_stride_in1 = m_strides_in1[m_dim_id_prim_N];
        m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
}
else
{
    // GEMM & BRGEMM
    m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_K];
    m_adjusted_stride_in1 = m_strides_in1[m_dim_id_prim_N];
    m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
}
m_adjusted_br_size_A = m_dim_id_prim_BR != -1 ? m_strides_in0[m_dim_id_prim_BR] : 1;
m_adjusted_br_size_B = m_dim_id_prim_BR != -1 ? m_strides_in1[m_dim_id_prim_BR] : 1;

/////////////////////////////////////////////////////////////////////
// Generate kernels
/////////////////////////////////////////////////////////////////////
if (prim_first_touch != ptype_t::none)
{
    // no transposition
    m_unary_first_touch.generate(m_dim_sizes[m_dim_id_prim_M],
                                 m_dim_sizes[m_dim_id_prim_N],
                                 0,
                                 dtype,
                                 prim_first_touch);

5.2 Recursive Loops over Primitives

After constructing our kernels, we still needed to build an execution function that combines our main primitive with our first and last touches.

Our starting point is an execute function that takes the pointers to our matrices and passes them to our execute_iter function.

starting point: execute function
                           1,
                           0,
                           0,
                           0,
                           dtype);
    m_kernel_gemm_main = m_brgemm_main.get_kernel();
}
else if (prim_main == ptype_t::brgemm)
{
    // no transposition
    m_brgemm_main.generate(m_dim_sizes[m_dim_id_prim_M],
                           m_dim_sizes[m_dim_id_prim_N],
                           m_dim_sizes[m_dim_id_prim_K],
                           m_dim_sizes[m_dim_id_prim_BR],
                           0,

The ‘real’ execution happens in the execute_iter function. We first check if the current iteration is the first or last access to a block in our output matrix. Next, we update the pointers to the matrices accordingly.
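The gist of this first/last-access logic can be sketched with a small, hypothetical helper (not our actual member function): only loops over a K dimension revisit the same output block, so only they restrict the flags.

#include <cstdint>
#include <utility>

enum class dim_kind { m, n, k };

// returns {is_first, is_last} for one iteration of a sequential loop
std::pair<bool, bool> access_flags(dim_kind type, int64_t iter, int64_t size,
                                   bool first_access, bool last_access)
{
    if (type == dim_kind::k)
    {
        // the output block is touched for the first time in iteration 0
        // and for the last time in the final iteration of the K loop
        return {first_access && iter == 0, last_access && iter == size - 1};
    }
    // M and N loops work on a different output block in every iteration
    return {first_access, last_access};
}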

calculate if it is the first or last access in our output matrix and update pointers
{
    // no main kernel
    m_kernel_gemm_main = nullptr;
    m_kernel_binary_main = nullptr;
    m_kernel_unary_main = nullptr;
}

if (prim_last_touch != ptype_t::none)
{
    // no transposition
    m_unary_last_touch.generate(m_dim_sizes[m_dim_id_prim_M],
                                m_dim_sizes[m_dim_id_prim_N],

In the next step, execute_iter recursively calls itself, descending one loop level for each seq dimension in our exec_types.

recursive call to execute_iter
                                prim_last_touch);
    m_kernel_last_touch = m_unary_last_touch.get_kernel();
}

m_kernel_first_touch_type = prim_first_touch;
m_kernel_main_type = prim_main;
m_kernel_last_touch_type = prim_last_touch;

m_has_been_setup = true;

If we have no further recursive call, we can execute the kernels.

execute the kernels
}

void mini_jit::TensorOperation::execute(void const *tensor_in0,
                                        void const *tensor_in1,
                                        void *tensor_out)
{
    if (!m_has_been_setup)
    {
        std::cerr << "TensorOperation has not been setup. Call setup() before execute()." << std::endl;
        return;
    }

    auto ptr_in0 = static_cast<char const *>(tensor_in0);
    auto ptr_in1 = static_cast<char const *>(tensor_in1);
    auto ptr_out = static_cast<char *>(tensor_out);

    if (m_num_parallel_loops == 0)
    {
        // No shared loops, execute sequentially
        execute_iter(0,
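At the innermost recursion level, the three kernels are applied to the current output block in a fixed order. The following standalone sketch shows that dispatch order (hypothetical and heavily simplified; the real kernels of course operate on the adjusted pointers and strides):

#include <functional>
#include <iostream>

// apply first touch, main kernel and last touch to one output block,
// guarded by the access flags
void run_block(bool is_first, bool is_last,
               const std::function<void()> &first_touch,
               const std::function<void()> &main_kernel,
               const std::function<void()> &last_touch)
{
    if (is_first && first_touch) first_touch(); // e.g. zero the block
    if (main_kernel) main_kernel();             // e.g. (BR)GEMM accumulation
    if (is_last && last_touch) last_touch();    // e.g. ReLU on the finished block
}

int main()
{
    run_block(true, true,
              [] { std::cout << "first touch\n"; },
              [] { std::cout << "main kernel\n"; },
              [] { std::cout << "last touch\n"; });
    return 0;
}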

5.3 Performance Benchmarking

To test the performance of our kernels constructed at runtime, and to see whether everything works together seamlessly, we ran some reference benchmarks.

We were given a number of benchmark configurations that we should check:

Benchmark Configuration

Variable         | 1st Value                         | 2nd Value                          | 3rd Value
-----------------|-----------------------------------|------------------------------------|------------------------------------
dtype            | FP32                              | FP32                               | FP32
prim_first_touch | None                              | None                               | Zero
prim_main        | GEMM                              | BRGEMM                             | BRGEMM
prim_last_touch  | None                              | None                               | ReLU
dim_types        | (M, N, K, M, N, K)                | (M, N, K, M, N, K)                 | (M, N, K, M, N, K)
exec_types       | (Seq, Seq, Seq, Prim, Prim, Prim) | (Seq, Seq, Prim, Prim, Prim, Prim) | (Seq, Seq, Prim, Prim, Prim, Prim)
dim_sizes        | (32, 32, 8, 32, 32, 32)           | (32, 32, 8, 32, 32, 32)            | (32, 32, 8, 32, 32, 32)
strides_in0      | (8192, 0, 1024, 1, 0, 32)         | (8192, 0, 1024, 1, 0, 32)          | (8192, 0, 1024, 1, 0, 32)
strides_in1      | (0, 8192, 1024, 0, 32, 1)         | (0, 8192, 1024, 0, 32, 1)          | (0, 8192, 1024, 0, 32, 1)
strides_out      | (32768, 1024, 0, 1, 32, 0)        | (32768, 1024, 0, 1, 32, 0)         | (32768, 1024, 0, 1, 32, 0)

However, when benchmarking our implementation with these strides, we ran into memory errors. For that reason, we decided to adjust the strides of the benchmarks slightly.

Our Benchmark Configuration (changed entries: strides_in0, strides_in1, strides_out)

Variable    | 1st Value                  | 2nd Value                  | 3rd Value
------------|----------------------------|----------------------------|----------------------------
dim_sizes   | (32, 32, 8, 32, 32, 32)    | (32, 32, 8, 32, 32, 32)    | (32, 32, 8, 32, 32, 32)
strides_in0 | (1024, 0, 32768, 1, 0, 32) | (1024, 0, 32768, 1, 0, 32) | (1024, 0, 32768, 1, 0, 32)
strides_in1 | (0, 8192, 32, 0, 1024, 1)  | (0, 8192, 32, 0, 1024, 1)  | (0, 8192, 32, 0, 1024, 1)
strides_out | (32, 32768, 0, 1, 1024, 0) | (32, 32768, 0, 1, 1024, 0) | (32, 32768, 0, 1, 1024, 0)

When benchmarking our configurations, we achieved the following GFLOPS performance:

GFLOPS performance of our benchmark configurations
Running TensorOperationBench benchmark #1
Total time (s):                  3.0052
Total reps:                      398
Total floating point operations: 213674622976
Estimated GFLOPS/sec:            71.1017
--------------------------------------------------
Running TensorOperationBench benchmark #2
Total time (s):                  3.0062
Total reps:                      413
Total floating point operations: 221727686656
Estimated GFLOPS/sec:            73.7568
--------------------------------------------------
Running TensorOperationBench benchmark #3
Total time (s):                  3.00738
Total reps:                      400
Total floating point operations: 214748364800
Estimated GFLOPS/sec:            71.4071
--------------------------------------------------

The results show that we achieve between 71 and 73 GFLOPS for all our executions. These results are roughly consistent with the performance of the kernels when benchmarked independently.

Note

Since the submission we have made some minor changes to our implementation. First, we fixed some errors and were then able to use the strides that we were provided with. Second, we decided to enhance our matmul_m_n_k implementation; afterwards we were able to generate kernels of size 16x4 instead of 8x4. This helped us increase the results from 71-73 GFLOPS to around 90-91 GFLOPS.

GFLOPS performance of the initial benchmark configuration with the enhanced matmul kernel
Running TensorOperationBench benchmark #1
Total time (s):                  3.00343
Total reps:                      510
Total floating point operations: 273804165120
Estimated GFLOPS/sec:            91.1637
--------------------------------------------------
Running TensorOperationBench benchmark #2
Total time (s):                  3.00537
Total reps:                      514
Total floating point operations: 275951648768
Estimated GFLOPS/sec:            91.8195
--------------------------------------------------
Running TensorOperationBench benchmark #3
Total time (s):                  3.00405
Total reps:                      505
Total floating point operations: 271119810560
Estimated GFLOPS/sec:            90.2515
--------------------------------------------------

5.4 Shared Memory Parallelization

To enable the execution of shared loops, we needed to make a few adjustments to our setup code:

gather shared loop IDs and dimension sizes
                           0,
                           0,
                           0,
                           dtype);
    m_kernel_gemm_main = m_brgemm_main.get_kernel();
}
else if (prim_main == ptype_t::identity)
{
    m_unary_main.generate(m_dim_sizes[m_dim_id_prim_M],
assign the size of the shared dimensions according to the order in dim_types
for (int i = l_dim_types_size - 1; i >= 0; i--)
{
    if (m_exec_types[i] == exec_t::prim)
    {
        if (m_dim_id_prim_M == -1 && m_dim_types[i] == dim_t::m)
        {
            m_dim_id_prim_M = i;
        }
        else if (m_dim_id_prim_N == -1 && m_dim_types[i] == dim_t::n)
        {
            m_dim_id_prim_N = i;
        }
        else if (m_dim_id_prim_K == -1 && m_dim_types[i] == dim_t::k)

In our execute function, we just need to check whether our m_num_parallel_loops variable is greater than zero. If it is, we execute our execute_iter_parallel function:

multiply shared loop sizes to get total number of iterations
                              ptr_in1,
                              ptr_out,
                              true,
                              true);
    }
}

The idea is to get a flat iteration space that can be used to parallelize over.
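Sketched with the member names introduced above (the exact code differs slightly), the flat loop looks roughly like this:

// total number of iterations is the product of all shared loop sizes
int64_t l_num_iters = 1;
for (size_t i = 0; i < m_shared_loop_sizes.size(); ++i)
{
    l_num_iters *= m_shared_loop_sizes[i];
}
// OpenMP distributes the flat index range across the available threads
#pragma omp parallel for
for (int64_t l_it_all = 0; l_it_all < l_num_iters; ++l_it_all)
{
    // unflatten l_it_all, compute the tensor offsets and call execute_iter (see below)
}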

We ‘unflatten’ the OpenMP iteration index l_it_all into a set of loop indices, one for each shared loop dimension. These indices are then used to compute the offsets for the in0, in1, and out tensors:

calculate the tensor offsets
                                             bool last_access)
{
    // there is only one iteration if the dimension is the first primitive
    const int64_t l_size = id_loop != m_id_first_primitive_loop ? m_dim_sizes[id_loop] : 1;
    const int64_t dtype_sz = dtype_size();
    const int64_t l_stride_in0 = m_strides_in0[id_loop] * dtype_sz;
    const int64_t l_stride_in1 = m_strides_in1[id_loop] * dtype_sz;
    const int64_t l_stride_out = m_strides_out[id_loop] * dtype_sz;

    for (int64_t l_iter = 0; l_iter < l_size; l_iter++)
    {
        bool is_first = first_access;
        bool is_last = last_access;
        // if the size is 1, it is always the first and last access

Here we are calculating the offset for the current thread. Every shared loop contributes to the calculation with its corresponding stride.
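The following self-contained example demonstrates the unflattening and the offset computation with made-up sizes and strides (output tensor only; our implementation does the same for in0 and in1):

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<int64_t> shared_loop_sizes  = {4, 2};     // two shared loops
    const std::vector<int64_t> shared_strides_out = {1024, 32}; // output strides of these loops
    const int64_t num_iters = shared_loop_sizes[0] * shared_loop_sizes[1];

    for (int64_t it_all = 0; it_all < num_iters; ++it_all)
    {
        int64_t remainder  = it_all;
        int64_t offset_out = 0;
        // walk the shared loops from innermost to outermost
        for (int64_t i = static_cast<int64_t>(shared_loop_sizes.size()) - 1; i >= 0; --i)
        {
            const int64_t idx = remainder % shared_loop_sizes[i];
            remainder /= shared_loop_sizes[i];
            offset_out += idx * shared_strides_out[i];
        }
        std::cout << "flat index " << it_all << " -> output offset " << offset_out << "\n";
    }
    return 0;
}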

Lastly, we call our execute_iter function. Depending on whether we have a seq dimension, we need to be careful which loop id we pass to the function:

call remaining loops with execute_iter
execute_iter(id_loop + 1,
             sub_ptr_in0,
             sub_ptr_in1,
             sub_ptr_out,
             is_first,
             is_last);

We also executed the benchmark configurations from the sequential execution task, this time using OMP_NUM_THREADS=4:

GFLOPS performance for the shared loop execution with 4 threads
Running SharedTensorOperationBench benchmark #1
Total time (s):                  3.00107
Total reps:                      1993
Total floating point operations: 1069983727616
Estimated GFLOPS/sec:            356.534
--------------------------------------------------
Running SharedTensorOperationBench benchmark #2
Total time (s):                  3.00002
Total reps:                      2168
Total floating point operations: 1163936137216
Estimated GFLOPS/sec:            387.976
--------------------------------------------------
Running SharedTensorOperationBench benchmark #3
Total time (s):                  3.00093
Total reps:                      2126
Total floating point operations: 1141387558912
Estimated GFLOPS/sec:            380.345
--------------------------------------------------

With the parallelization we achieve about 360 to 390 GFLOPS.

5.5 Optimization Passes

Our approach to enhancing the performance of the tensor operations was to use a vector of structs, one for each dimension:

Dimension struct used by the optimization passes
struct Dimension
{
    //! Type of the dimension (M, N, K)
    dim_t type = dim_t::m;
    //! Execution type (Prim, Seq, Shared, ...)
    exec_t exec_type = exec_t::undefined;
    //! Dimension size
    int64_t size = 0;
    //! Stride in the first input tensor
    int64_t stride_in0 = 0;
    //! Stride in the second input tensor
    int64_t stride_in1 = 0;
    //! Stride in the output tensor
    int64_t stride_out = 0;

    /**
     * @brief Construct a new Dimension object.
     *
     * @param type Type of the dimension (M, N, K).
     * @param exec_type Execution type (Prim, Seq, Shared, ...).
     * @param size Size of the dimension.
     * @param stride_in0 Stride in the first input tensor.
     * @param stride_in1 Stride in the second input tensor.
     * @param stride_out Stride in the output tensor.
     */
    Dimension(dim_t type,
              exec_t exec_type,
              int64_t size,
              int64_t stride_in0,
              int64_t stride_in1,
              int64_t stride_out)
        : type(type),
          exec_type(exec_type),
          size(size),
          stride_in0(stride_in0),
          stride_in1(stride_in1),
          stride_out(stride_out)
    {
        if (size <= 0)
        {
            throw std::invalid_argument("Dimension size needs to be greater than 0");
        }
    }
};

This struct is used to store all information about a dimension.

After setting this up, we could create our optimization passes.

5.5.1 Primitive Identification

The first optimization that we performed was to find primitive dimensions. This optimization is useful for cases where we are given only sequential loops. Our approach to this optimization was the following:

finding the K2 prim dimension for the BRGEMM case
        }
    }

    // lastly, set all remaining dimensions to seq
    for (auto &dim : dimensions)
    {
        if (dim.exec_type == exec_t::undefined)
        {
            dim.exec_type = exec_t::seq;
        }
    }

    return; // all primary dimensions set
}
// BINARY CASE
else if (!l_has_k_dim)
{
    // check for existing primary dimensions
    int prim_count = std::count_if(dimensions.begin(), dimensions.end(),
                                     [](const mini_jit::ir::Dimension &dim)
                                     {
                                         return dim.type == dim_t::c && dim.exec_type == exec_t::prim;
                                     });

We identify the respective dimensions by looking at the strides in the in1 and out tensors. As a starting point, we use the fact that a column-major BRGEMM requires a certain unit-stride pattern for its tensors (...M, ...K1 -> ...M), which we try to follow. This means that the K2 dimension we need for our BRGEMM must not have unit stride in the first input tensor.
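As an illustration, here is a hedged sketch (not our exact code) of how such a K2 candidate could be located with std::find_if, using the Dimension struct from section 5.5; the assumed criteria are: K type, no unit stride in either input, and stride 0 in the output:

auto l_dim_k2_it = std::find_if(dimensions.begin(), dimensions.end(),
                                [](const mini_jit::ir::Dimension &dim)
                                {
                                    return dim.type == dim_t::k &&
                                           dim.stride_in0 != 1 &&
                                           dim.stride_in1 != 1 &&
                                           dim.stride_out == 0;
                                });
if (l_dim_k2_it != dimensions.end())
{
    l_dim_k2_it->exec_type = exec_t::prim;
    // rotate K2 to the back so that the later rotations yield ..., K2, M, N, K1
    std::rotate(l_dim_k2_it, l_dim_k2_it + 1, dimensions.end());
}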

Similarly, we did the same for:

  • M dimension

  • N dimension

  • K1 dimension

For the N dimension, where we had no indication of which tensor to base the decision on, we simply chose the N dimension with the smallest stride:

finding the N prim dimension with smallest stride
        std::rotate(l_dim_m_it, l_dim_m_it + 1, dimensions.end());
    }
}
else
{
    throw std::invalid_argument("Optimizer: No suitable primary dimension M found.");
}

/////////////////////////////////////////////////////////////////
// FIND PRIM N
/////////////////////////////////////////////////////////////////
// req: choose the one with smallest stride
int l_n_dim_stride = INT_MAX;
int l_n_dim_id = -1;
for (size_t i = 0; i < dimensions.size(); i++)
{
    if (dimensions[i].type == dim_t::n &&
        dimensions[i].stride_in0 == dimensions[i].stride_in1)
    {

We performed the identification in the order K2, M, N, K1. The reason for this order is that after identifying a dimension, we rotate it to the end of the vector. This ultimately leads to the structure ..., K2, M, N, K1 for our identified primitive dimensions.
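To illustrate the effect of these rotations, here is a small standalone example that uses plain characters instead of Dimension objects:

#include <algorithm>
#include <iostream>
#include <string>

int main()
{
    // lower-case letters stand for arbitrary outer dimensions,
    // 'K' for K2 and 'k' for K1
    std::string dims = "aKbMcNdk";
    // identify and rotate to the back in the order K2, M, N, K1
    for (char c : std::string("KMNk"))
    {
        auto it = std::find(dims.begin(), dims.end(), c);
        std::rotate(it, it + 1, dims.end());
    }
    std::cout << dims << "\n"; // prints "abcdKMNk", i.e. ..., K2, M, N, K1
    return 0;
}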

5.5.2 Dimension Splitting

For our second optimization pass we decided to look at the dimension sizes of our loops. We introduced a max_kernel_size parameter, which specifies the maximum allowed size for a dimension. If a dimension with a size larger than the maximum is found, the dimension splitter will try to split it into new dimensions with optimized sizes. The entry point for this optimization is the splitDimensions function:

splitDimensions function of the Optimizer
void mini_jit::ir::Optimizer::splitDimensions(std::vector<mini_jit::ir::Dimension> &dimensions,
                                              int64_t max_kernel_size)
{
    // Dimensions should be split if they are too large (> max_kernel_size)
    for (size_t i = 0; i < dimensions.size(); i++)
    {
        if (dimensions[i].size > max_kernel_size)
        {
            int64_t l_size_dim_0 = 0;
            int64_t l_size_dim_1 = 0;
            findBestSplit(dimensions[i].size,
                          max_kernel_size,
                          dimensions[i].type,
                          l_size_dim_0,
                          l_size_dim_1);
            if (l_size_dim_0 > 1)
            {
                // create a new seq dimension
                mini_jit::ir::Dimension l_dim_new(dimensions[i].type,
                                                  exec_t::seq,
                                                  l_size_dim_0,
                                                  dimensions[i].stride_in0 * l_size_dim_1,
                                                  dimensions[i].stride_in1 * l_size_dim_1,
                                                  dimensions[i].stride_out * l_size_dim_1);
                // update the original dimension size
                dimensions[i].size = l_size_dim_1;
                // insert the new dimension at the back, so it will be checked for a split again
                dimensions.push_back(l_dim_new);
            }
        }
    }
}

For each dimension, it finds the best split for our kernels if the dimension size is too large, and creates a new dimension. The size of the original dimension is updated to l_size_dim_1, which will be smaller than or equal to max_kernel_size. However, the new dimension l_dim_new might still have a size larger than max_kernel_size, which is why it is inserted at the end of the dimensions vector, where it will be checked for a possible split again in a later iteration.
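As a hypothetical usage example (namespaces omitted and assuming splitDimensions is callable as a static member), an M dimension of size 1600 with unit stride and max_kernel_size = 1024 would be handled as follows:

std::vector<mini_jit::ir::Dimension> dims = {
    // type, exec type, size, stride_in0, stride_in1, stride_out
    {dim_t::m, exec_t::seq, 1600, 1, 0, 1}
};
mini_jit::ir::Optimizer::splitDimensions(dims, 1024);
// dims[0] now has size 800 and keeps its original strides; a new seq M dimension
// of size 2 with strides (800, 0, 800) was appended at the back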

But what does findBestSplit do?

The way our kernels were implemented makes their execution more efficient for specific dimension sizes. Considering the M dimension, a size that is a multiple of 16 is optimal for most kernels, since we manually optimized the kernels for this case. As for the N dimension size, a multiple of 4 is optimal for most kernels. In the K dimension, we do not have such optimizations and the dimension size can be chosen freely, as long as it is smaller than max_kernel_size. The following code snippet shows the implementation of findBestSplit for the M and N dimensions:

findBestSplit function of the Optimizer for M and N
o_size_0 = 1;
o_size_1 = i_size;
if (i_type == dim_t::m)
{
    // multiples of (multiples of) 4 are efficient (LDP, STP)
    for (int64_t i = 16; i > 4; i -= 4)
    {
        findLargestMultipleOfDivisor(i, i_size, i_max_kernel_size, o_size_0, o_size_1);
        if (o_size_0 > 1)
        {
            return;
        }
    }
    // split by 2
    findLargestMultipleOfDivisor(2, i_size, i_max_kernel_size, o_size_0, o_size_1);
    if (o_size_0 > 1)
    {
        return;
    }
}
// for n, we want multiples of 4
else if (i_type == dim_t::n)
{
    // split by 4
    findLargestMultipleOfDivisor(4, i_size, i_max_kernel_size, o_size_0, o_size_1);
    if (o_size_0 > 1)
    {
        return;
    }
    // split by 2
    findLargestMultipleOfDivisor(2, i_size, i_max_kernel_size, o_size_0, o_size_1);
    if (o_size_0 > 1)
    {
        return;
    }
}

But what does findLargestMultipleOfDivisor do?

As the name suggests, this helper function tries to find the largest multiple of a given divisor. Let’s say the given divisor is 16, the input dimension size is 1600 and the i_max_kernel_size is 1024. Then, findLargestMultipleOfDivisor will try to find the largest multiple of 16 which divides 1600 and is smaller than or equal to 1024. The result of this computation is 2 for o_size_0 and 800 for o_size_1. For the more curious reader, the implementation of findLargestMultipleOfDivisor is given below:

findLargestMultipleOfDivisor function of the Optimizer
void mini_jit::ir::Optimizer::findLargestMultipleOfDivisor(int64_t i_divisor,
                                                           int64_t i_size,
                                                           int64_t i_max_size,
                                                           int64_t &o_size_0,
                                                           int64_t &o_size_1)
{
    if (i_divisor <= 0 || i_size <= 0 || i_max_size <= 0 || i_divisor > i_max_size)
    {
        return;
    }

    // start: largest multiple of i_divisor < i_max_size
    int64_t l_max_divisible = (i_max_size / i_divisor) * i_divisor;
    for (int64_t l_m = l_max_divisible; l_m >= i_divisor; l_m -= i_divisor)
    {
        // we found an m that divides i_size! it is also the largest
        if (i_size % l_m == 0)
        {
            o_size_0 = i_size / l_m;
            o_size_1 = l_m;
            return;
        }
    }
}

5.5.3 Shared Memory Parallelization

Our third optimization pass was to turn every loop that is not a prim dimension and is of dimension type M or N into a shared loop. For that we first check how many loops already have the exec type shared:

finding possible iterations for shared loops
{
    if (dimensions[i].type == dim_t::n &&
        dimensions[i].stride_in0 == 0)
    {
        int l_current_strides = dimensions[i].stride_in1 + dimensions[i].stride_out;
        if (l_current_strides < l_n_dim_strides)
        {
            l_n_dim_strides = l_current_strides;

If we already have a high number of shared loops, we do not create any more and simply return. Otherwise, we check the seq dimensions for potential candidates:

select shared loop candidates
if (l_n_dim_id != static_cast<int>(dimensions.size()) - 1)
{
    std::rotate(dimensions.begin() + l_n_dim_id, dimensions.begin() + l_n_dim_id + 1, dimensions.end());
}
/////////////////////////////////////////////////////////////////
// FIND PRIM K
/////////////////////////////////////////////////////////////////
// req: unit stride in in1, stride_out has to be 0
auto l_dim_k_it = std::find_if(dimensions.begin(), dimensions.end(),
                               [](const mini_jit::ir::Dimension &dim)
                               {
                                   return dim.type == dim_t::k &&
                                          dim.stride_in1 == 1 &&

As a last step we move all our shared loops to the front of the order:

move shared loops to the front
if (l_dim_k_it != dimensions.end())
{
    // set dimension data
    l_dim_k_it->type = dim_t::k;
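The reordering itself can be done with std::stable_partition, as in the following fragment (this is the same call we also use in section 5.6.2):

// move all shared dimensions to the front while keeping their relative order
std::stable_partition(dimensions.begin(), dimensions.end(),
                      [](const mini_jit::ir::Dimension &dim)
                      {
                          return dim.exec_type == exec_t::shared;
                      });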

5.5.4 Dimension Fusion

Note

This part of the Optimizer was implemented much later, as part of the 7.4 Progress of Week 2 of our final project phase.

The idea behind Dimension Fusion is that when certain dimensions have very small sizes, fusing them can improve cache efficiency and simplify tensor expressions. It also enables our existing dimension splitter to operate more effectively, as it can now split the fused dimensions in ways optimized for our kernels, rather than being constrained by the original tensor structure. In other words, Dimension Fusion will be the first step in our optimizer, simplifying the tensor expression upfront so it can then be split in an optimized way and finally, have its primitive dimensions identified.

The first step was to introduce a new min_kernel_size parameter. It allows the user to specify the minimum dimension size a kernel should have. If a dimension is smaller than that, the dimension fuser will try to look for candidates to fuse with. This process happens in the new fuseDimensions function of the Optimizer.

Dimension Fusing in the Optimizer
void mini_jit::ir::Optimizer::fuseDimensions(std::vector<mini_jit::ir::Dimension> &dimensions,
                                             int64_t min_kernel_size)
{
    for (size_t i = 0; i < dimensions.size(); i++)
    {
        mini_jit::ir::Dimension &l_dim_0 = dimensions[i];
        if (l_dim_0.size < min_kernel_size)
        {
            // find a dimension that can be fused with the current one
            for (size_t j = 0; j < dimensions.size(); j++)
            {
                if (i == j) continue; // skip self

                mini_jit::ir::Dimension &l_dim_1 = dimensions[j];
                if (l_dim_0.type == l_dim_1.type &&
                    (l_dim_0.exec_type == l_dim_1.exec_type ||
                    l_dim_0.exec_type == exec_t::undefined ||
                    l_dim_1.exec_type == exec_t::undefined) &&
                    l_dim_1.stride_in0 == l_dim_0.size * l_dim_0.stride_in0 &&
                    l_dim_1.stride_in1 == l_dim_0.size * l_dim_0.stride_in1 &&
                    l_dim_1.stride_out == l_dim_0.size * l_dim_0.stride_out)
                {
                    // fuse the two dimensions
                    l_dim_0.size *= l_dim_1.size;
                    // remove the fused dimension
                    dimensions.erase(dimensions.begin() + j);
                    j--; // adjust index after erasing
                }
            }
        }
    }
}

Here, l_dim_0 is the dimension whose size is smaller than min_kernel_size, meaning that we would like to fuse it with another candidate. However, the candidate (l_dim_1) the function looks for needs to fulfill some criteria:

  1. Same dimension type as l_dim_0 (M, N, K, C)

  2. Same execution type as l_dim_0, or either type is undefined

  3. The stride of l_dim_1 needs to equal the product of the stride and size of l_dim_0 (two dimensions X and Y can be fused if, for all tensors: stride(X) = |Y| ⨉ stride(Y))

If a fitting candidate has been found, l_dim_0 and l_dim_1 can be fused. This involves multiplying the dimension sizes and removing the candidate from the dimensions vector. The strides do not need to be adjusted, as the original stride of the small l_dim_0 is still correct.
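A small worked example (hypothetical values, namespaces omitted and assuming fuseDimensions is callable as a static member): two contiguous M dimensions of sizes 4 and 8 are fused into a single M dimension of size 32.

std::vector<mini_jit::ir::Dimension> dims = {
    // type, exec type, size, stride_in0, stride_in1, stride_out
    {dim_t::m, exec_t::undefined, 4, 1, 0, 1},
    {dim_t::m, exec_t::undefined, 8, 4, 0, 4} // strides = |dim 0| * strides of dim 0
};
mini_jit::ir::Optimizer::fuseDimensions(dims, /*min_kernel_size=*/16);
// dims now contains a single M dimension of size 32 with strides (1, 0, 1)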

After implementing dimension fusion, we also had to make adjustments to the dimension splitter. Previously, we would split dimensions by finding the largest possible split for one dimension. For example, if the given dimension size was 1600 and the maximum kernel size 1024, the function would have returned 2 for o_size_0 and 800 for o_size_1, because 800 is the largest multiple of 16 that is less than or equal to 1024. This was problematic, because we then had a dimension of size 2, which is very small and could have led to inefficiencies. Our solution to this problem was to introduce the min_kernel_size parameter to the dimension splitter as well. Specifically, we adjusted the findBestSplit function, which now only returns a split if the min_kernel_size is reached:

Updated findBestSplit function for M dimensions
if (i_type == dim_t::m)
{
    // multiples of (multiples of) 4 are efficient (LDP, STP)
    for (int64_t i = 16; i > 4; i -= 4)
    {
        findLargestMultipleOfDivisor(i, i_size, i_max_kernel_size, i_min_kernel_size, o_size_0, o_size_1);
        if (o_size_0 >= i_min_kernel_size)
        {
            return;
        }
    }
    // split by 2
    findLargestMultipleOfDivisor(2, i_size, i_max_kernel_size, i_min_kernel_size, o_size_0, o_size_1);
    if (o_size_0 >= i_min_kernel_size)
    {
        return;
    }
}

Consequently, findLargestMultipleOfDivisor had to be adjusted as well, with a simple if-condition:

Updated findLargestMultipleOfDivisor functionalities
void mini_jit::ir::Optimizer::findLargestMultipleOfDivisor(int64_t i_divisor,
                                                          int64_t i_size,
                                                          int64_t i_max_size,
                                                          int64_t i_min_size,
                                                          int64_t &o_size_0,
                                                          int64_t &o_size_1)
{
    if (i_divisor <= 0 || i_size <= 0 || i_max_size <= 0 || i_min_size <= 0 ||
        i_divisor > i_max_size || i_size < i_min_size)
    {
        return;
    }

    // start: largest multiple of i_divisor < i_max_size
    int64_t l_max_divisible = (i_max_size / i_divisor) * i_divisor;
    for (int64_t l_m = l_max_divisible; l_m >= i_divisor; l_m -= i_divisor)
    {
        // we found an m that divides i_size! it is also the largest
        if (i_size % l_m == 0)
        {
            int64_t candidate_size_0 = i_size / l_m;
            int64_t candidate_size_1 = l_m;
            if (candidate_size_0 >= i_min_size && candidate_size_1 >= i_min_size)
            {
                o_size_0 = candidate_size_0;
                o_size_1 = candidate_size_1;
                return;
            }
        }
    }
}

Candidates for splitting are now only chosen if both dimension sizes are at least as large as the specified minimum kernel size. Therefore, the new dimension splitter now outputs 50 and 32 as a split of 1600, if min_kernel_size is set to 16.

5.5.5 Performance Benchmarks

We also benchmarked the results for some configurations:

GFLOPS performance for sample configurations
Running SharedTensorOperationBench benchmark #1
Total time (s):                  3.01164
Total reps:                      101
Total floating point operations: 827392000000
Estimated GFLOPS/sec:            274.731
--------------------------------------------------
#####################################################
Testing different kernel sizes
#####################################################
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 1024)
Total time (s):                  3.02859
Total reps:                      105
Total floating point operations: 860160000000
Estimated GFLOPS/sec:            284.013
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 1024)
Total time (s):                  3.02753
Total reps:                      105
Total floating point operations: 860160000000
Estimated GFLOPS/sec:            284.113
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 512)
Total time (s):                  3.0037
Total reps:                      95
Total floating point operations: 778240000000
Estimated GFLOPS/sec:            259.093
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 512)
Total time (s):                  3.00784
Total reps:                      95
Total floating point operations: 778240000000
Estimated GFLOPS/sec:            258.738
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 256)
Total time (s):                  3.01035
Total reps:                      112
Total floating point operations: 917504000000
Estimated GFLOPS/sec:            304.783
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 256)
Total time (s):                  3.00121
Total reps:                      110
Total floating point operations: 901120000000
Estimated GFLOPS/sec:            300.253
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 125)
Total time (s):                  3.00583
Total reps:                      129
Total floating point operations: 1056768000000
Estimated GFLOPS/sec:            351.573
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 125)
Total time (s):                  3.00655
Total reps:                      129
Total floating point operations: 1056768000000
Estimated GFLOPS/sec:            351.488
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 64)
Total time (s):                  3.01405
Total reps:                      119
Total floating point operations: 974848000000
Estimated GFLOPS/sec:            323.435
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 64)
Total time (s):                  3.01289
Total reps:                      119
Total floating point operations: 974848000000
Estimated GFLOPS/sec:            323.559
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 32)
Total time (s):                  3.00815
Total reps:                      125
Total floating point operations: 1024000000000
Estimated GFLOPS/sec:            340.409
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 32)
Total time (s):                  3.00952
Total reps:                      126
Total floating point operations: 1032192000000
Estimated GFLOPS/sec:            342.975
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 16)
Total time (s):                  3.0137
Total reps:                      40
Total floating point operations: 327680000000
Estimated GFLOPS/sec:            108.73
--------------------------------------------------
Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 16)
Total time (s):                  3.01834
Total reps:                      119
Total floating point operations: 974848000000
Estimated GFLOPS/sec:            322.975
--------------------------------------------------

Depending on the selected kernel sizes and thread targets, our results varied considerably. The highest performance we achieved was around 350 GFLOPS.

5.6 Unary Operations

After supporting the standard primitives with GEMM and BRGEMM, we now also allow primitives like copy / identity, transposition and permutation.

5.6.1 Backend Extension

The distinction from the other primitives here is that all dimensions are of type dim_t::c. To allow this kind of primitive, we had to make some small adjustments to our TensorOperation:

find M and N dimensions based on stride in the input
        }
        else if (m_dim_id_prim_K != -1 && m_dim_id_prim_BR == -1 && m_dim_types[i] == dim_t::k)
        {
            m_dim_id_prim_BR = i;
        }
    }
}

/////////////////////////////////////////////////////////////////////
// Read SEQ and SHARED dimensions using dim types
/////////////////////////////////////////////////////////////////////
for (size_t i = 0; i < m_dim_types.size(); ++i)
{
    if (m_exec_types[i] == exec_t::seq)
    {
        if (m_dim_types[i] == dim_t::m)
        {
            m_dim_id_seq_M = i;
        }
        else if (m_dim_types[i] == dim_t::n)
        {
generate identity primitive
        m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
}
else
{
    // GEMM & BRGEMM
    m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_K];
    m_adjusted_stride_in1 = m_strides_in1[m_dim_id_prim_N];
    m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
}

5.6.2 Optimization Passes

To run our optimization passes on these primitives, we again had to make some adjustments, this time in our Optimizer.

For our identifyPrimitives function, we first check whether dim_t::c occurs as a dimension type. If it does, we continue to find the prim dimensions:

error handling for correct dimension types
// Handle identity case first
auto l_has_c_dim = std::any_of(dimensions.begin(), dimensions.end(),
                               [](const mini_jit::ir::Dimension &dim)
                               {
                                   return dim.type == dim_t::c;
                               });

// Handle binary case
auto l_has_k_dim = std::any_of(dimensions.begin(), dimensions.end(),
                               [](const mini_jit::ir::Dimension &dim)
                               {
                                   return dim.type == dim_t::k;
                               });

if (l_has_c_dim)
{
    // check that all dimensions are c
    if (!std::all_of(dimensions.begin(), dimensions.end(),
                     [](const mini_jit::ir::Dimension &dim)
exit early, if all prim dimensions are already set
}
// check for existing primary dimensions
int prim_c_count = std::count_if(dimensions.begin(), dimensions.end(),
                                 [](const mini_jit::ir::Dimension &dim)
                                 {
                                     return dim.type == dim_t::c && dim.exec_type == exec_t::prim;
                                 });
if (prim_c_count == 2)
{
    return; // primary dimensions already set
}
else if (prim_c_count != 0)
{
    throw std::invalid_argument("Optimizer: Expected 0 or 2 primary dimensions of type 'c', found " + std::to_string(prim_c_count) + ". Try setting all dimensions to seq or undefined.");
}

/////////////////////////////////////////////////////////////////
// FIND UNARY PRIM M
/////////////////////////////////////////////////////////////////
// req: unit stride in in0. Out might not be 1 if operation transposes.
auto l_dim_m_it = std::find_if(dimensions.begin(), dimensions.end(),
                               [](const mini_jit::ir::Dimension &dim)
                               {
                                   return (dim.type == dim_t::c) &&
                                          dim.stride_in0 == 1 &&
                                          dim.stride_in1 == 0;
                               });

bool l_transpose = false;
124
125bool l_transpose = false;
check for transposition and find dimensions accordingly
{
    // transpose if the output stride in M is not 1
    l_transpose = l_dim_m_it->stride_out != 1;
    // set dimension data
    l_dim_m_it->exec_type = exec_t::prim;
    if (l_dim_m_it != dimensions.end() - 1)
    {
        // move M to the back
        std::rotate(l_dim_m_it, l_dim_m_it + 1, dimensions.end());
    }
}
else
{
    throw std::invalid_argument("Optimizer: No suitable primary dimension M found.");
}

if (l_transpose)
{
    /////////////////////////////////////////////////////////////////
    // FIND UNARY PRIM N
    /////////////////////////////////////////////////////////////////
    // req: unit stride in out.
    auto l_dim_n_it = std::find_if(dimensions.begin(), dimensions.end(),
                                   [](const mini_jit::ir::Dimension &dim)
                                   {
                                       return (dim.type == dim_t::c) &&
                                              dim.stride_out == 1 &&
                                              dim.stride_in1 == 0;
                                   });

If we do not have a transposition, we simply look for the smallest stride and set the dimensions accordingly:

set dimensions for identity primitive
        l_dim_n_it->exec_type = exec_t::prim;
        if (l_dim_n_it != dimensions.end() - 1)
        {
            // move N to the back
            std::rotate(l_dim_n_it, l_dim_n_it + 1, dimensions.end());
        }
    }
    else
    {
        throw std::invalid_argument("Optimizer: No suitable primary dimension N found.");
    }
}
else
{
    // TODO: check if this is ok

    /////////////////////////////////////////////////////////////////
    // FIND UNARY PRIM N
    /////////////////////////////////////////////////////////////////
    // req: choose the one with the smallest stride in in0
    int l_n_dim_stride = INT_MAX;

The last step is to set the remaining undefined dimensions to seq, as the next optimization pass will look for ideal shared loops.

However, in our createSharedLoops function, we did not have to make any adjustments.

For our splitDimensions function, we now also check whether we have dim_t::c as a dimension type:

split dimensions of type dim_t::c
        // increase thread number for each existing shared dimension
        l_num_threads *= dimensions[i].size;
    }
}

if (l_num_threads >= thread_target)
{
    // make sure that the shared loops are at the front
    std::stable_partition(dimensions.begin(), dimensions.end(),
                          [](const mini_jit::ir::Dimension &dim)
                          {
                              return dim.exec_type == exec_t::shared;
                          });
    // no need to create more shared loops
    return;
}

// Creation of new shared loops:
for (size_t i = 0; i < dimensions.size(); i++)
{
    // if the dimension can be set to shared and we did not reach the target number of threads yet
    // we set the dimension to shared
    // also dont parallelize the k dimension (see class slides)
    if ((dimensions[i].exec_type == exec_t::seq || dimensions[i].exec_type == exec_t::undefined) &&
        dimensions[i].type != dim_t::k &&

5.6.3 Reference Implementation

For our reference implementation, we used an example with 4 dimensions in the order trus, which we permute to the order turs.

initialize sizes for tensors
{
    const mini_jit::ptype_t first_touch_type = mini_jit::ptype_t::none;
    const mini_jit::ptype_t main_type = mini_jit::ptype_t::identity;
    const mini_jit::ptype_t last_touch_type = mini_jit::ptype_t::none;

    const int T = GENERATE(3, 4, 7);
    const int R = GENERATE(3, 4, 7);
    const int U = GENERATE(3, 4, 7);
    const int S = GENERATE(3, 4, 7);

    const int SIZE = T * R * U * S;

    float *A = new float[SIZE];
    float *C = new float[SIZE];
    float *C_expected = new float[SIZE];

    std::random_device rd;
    std::mt19937 gen(rd());
fill the tensors with values
for (int i = 0; i < SIZE; ++i)
{
    A[i] = dist(gen);
}

// Compute C_expected
for (int t = 0; t < T; ++t)
{
    for (int r = 0; r < R; ++r)
    {
        for (int u = 0; u < U; ++u)
        {
            for (int s = 0; s < S; ++s)
            {
                // Calculate index in output format (t,u,r,s) using strides_out
                int l_idx_c_exp = t * (U * R * S) + r * S + u * (R * S) + s;
                // Calculate index in input format (t,r,u,s) using strides_in0
                int l_idx_a = t * (R * U * S) + u * S + r * (U * S) + s;
                C_expected[l_idx_c_exp] = A[l_idx_a];
            }
        }
    }

Then we prepare the execution by setting all arguments accordingly:

prepare arguments for execution
std::vector<mini_jit::dim_t> dim_types = {
    mini_jit::dim_t::c, // t
    mini_jit::dim_t::c, // r
    mini_jit::dim_t::c, // u
    mini_jit::dim_t::c  // s
};

std::vector<mini_jit::exec_t> exec_types = {
    mini_jit::exec_t::seq,  // t
    mini_jit::exec_t::seq,  // r
    mini_jit::exec_t::prim, // u
    mini_jit::exec_t::prim  // s
};

std::vector<int64_t> dim_sizes = {
    T, R, U, S};

std::vector<int64_t> strides_in0 = {
    R * U * S, // t
    U * S,     // r
    S,         // u
    1          // s
};

std::vector<int64_t> strides_in1 = {0, 0, 0, 0};

std::vector<int64_t> strides_out = {
    U * R * S, // t
    S,         // r
    R * S,     // u
    1          // s
};

Finally, we would execute our implementation.