5. Tensor Operation Backend
After experimenting with different Neon implementations and developing kernels for our GEMM and BRGEMM, and most recently for the unary primitives, it is now time to combine all of these kernels in a common backend.
5.1 User Interface
The first thing that we need for our backend is a common entry point: our setup function. Within the setup function, we parse a number of configuration parameters, from which the corresponding kernels and primitives are constructed at runtime.
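The exact declaration is not repeated here, but based on the parameters used in the checks below, a sketch of the entry point could look like the following (the parameter order and exact types are assumptions, not our verbatim signature):
error_t setup(dtype_t                      dtype,
              ptype_t                      prim_first_touch,
              ptype_t                      prim_main,
              ptype_t                      prim_last_touch,
              std::vector<dim_t>   const & dim_types,
              std::vector<exec_t>  const & exec_types,
              std::vector<int64_t> const & dim_sizes,
              std::vector<int64_t> const & strides_in0,
              std::vector<int64_t> const & strides_in1,
              std::vector<int64_t> const & strides_out);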
In our setup function, we check several things:
17/////////////////////////////////////////////////////////////////////
18// Check the number of dimensions
19/////////////////////////////////////////////////////////////////////
20if (dim_types.size() != dim_sizes.size() ||
21 dim_types.size() != strides_in0.size() ||
22 dim_types.size() != strides_in1.size() ||
23 dim_types.size() != strides_out.size())
24{
25 return error_t::wrong_dimension;
26}
27
28/////////////////////////////////////////////////////////////////////
29// Check the number of prim exec types
30/////////////////////////////////////////////////////////////////////
31int prim_count = std::count(exec_types.begin(), exec_types.end(), exec_t::prim);
32if (prim_main == ptype_t::brgemm && prim_count != 4)
33{
34 return error_t::wrong_exec_type;
35}
36else if (prim_main == ptype_t::gemm && prim_count != 3)
37{
38 return error_t::wrong_exec_type;
39}
40else if (prim_main == ptype_t::identity && prim_count != 2)
41{
42 return error_t::wrong_exec_type;
43}
44else if (prim_main == ptype_t::add || prim_main == ptype_t::sub ||
45 prim_main == ptype_t::mul || prim_main == ptype_t::div ||
46 prim_main == ptype_t::min || prim_main == ptype_t::max)
47{
48 if (prim_count != 2)
49 {
50 return error_t::wrong_exec_type;
51 }
52}
53
54/////////////////////////////////////////////////////////////////////
55// Check allowed data type
56/////////////////////////////////////////////////////////////////////
57if (dtype != dtype_t::fp32)
58{
59 return error_t::wrong_dtype;
60}
61
62/////////////////////////////////////////////////////////////////////
63// Check allowed primitive types
64/////////////////////////////////////////////////////////////////////
65std::vector<ptype_t> allowed_first_touch_types = {
66 ptype_t::none,
67 ptype_t::zero,
68 ptype_t::relu,
70 ptype_t::reciprocal,
71 ptype_t::increment,
72 ptype_t::decrement
73};
74std::vector<ptype_t> allowed_main_types = {
75 ptype_t::none,
76 ptype_t::identity,
77 ptype_t::brgemm,
78 ptype_t::gemm,
79 ptype_t::add,
80 ptype_t::sub,
81 ptype_t::mul,
82 ptype_t::div,
83 ptype_t::min,
84 ptype_t::max
85};
86std::vector<ptype_t> allowed_last_touch_types = {
87 ptype_t::none,
88 ptype_t::relu,
89 ptype_t::square,
90 ptype_t::reciprocal,
91    ptype_t::increment,
92 ptype_t::decrement,
93 ptype_t::fast_sigmoid,
94 ptype_t::sigmoid_interp,
95 ptype_t::sigmoid_taylor,
96};
97
98if (std::find(allowed_first_touch_types.begin(), allowed_first_touch_types.end(), prim_first_touch) == allowed_first_touch_types.end())
99{
100 return error_t::wrong_ptype;
101}
102if (std::find(allowed_main_types.begin(), allowed_main_types.end(), prim_main) == allowed_main_types.end())
103{
104 return error_t::wrong_ptype;
105}
106if (std::find(allowed_last_touch_types.begin(), allowed_last_touch_types.end(), prim_last_touch) == allowed_last_touch_types.end())
107{
108 return error_t::wrong_ptype;
109}
110
111/////////////////////////////////////////////////////////////////////
112// Assign member variables
113/////////////////////////////////////////////////////////////////////
We need to know the position of the first prim dimension to determine when we have to call the main kernel instead of recursing deeper into the loop structure. In other words, we traverse the outer sequential loops, and as soon as we reach the first primitive dimension, we call the main kernel. For example, with exec_types = (Seq, Seq, Seq, Prim, Prim, Prim), the first primitive position is 3, so the three outer seq loops are traversed recursively and the kernel is called at that depth.
Find the first prim and seq position in exec_types and read the prim, seq and shared dimensions according to the order in dim_types
127m_dim_id_seq_N = -1;
128m_dim_id_seq_K = -1;
129m_dim_id_sha_M = -1;
130m_dim_id_sha_N = -1;
131m_num_parallel_loops = 0;
132
133/////////////////////////////////////////////////////////////////////
134// Find first PRIM and SEQ dimensions in exec types
135/////////////////////////////////////////////////////////////////////
136auto it = std::find(exec_types.begin(), exec_types.end(), exec_t::prim);
137if (it != exec_types.end())
138{
139 m_id_first_primitive_loop = std::distance(exec_types.begin(), it);
140}
141else
142{
143 m_id_first_primitive_loop = 0;
144}
145
146it = std::find(exec_types.begin(), exec_types.end(), exec_t::seq);
147if (it != exec_types.end())
148{
149 m_id_first_seq_loop = std::distance(exec_types.begin(), it);
150}
151else
152{
153 m_id_first_seq_loop = -1;
154}
155/////////////////////////////////////////////////////////////////////
156// Find SHARED dimensions in exec types
157/////////////////////////////////////////////////////////////////////
158m_shared_loop_ids.clear();
159m_shared_loop_sizes.clear();
160for (size_t i = 0; i < m_exec_types.size(); ++i)
161{
162 if (m_exec_types[i] == exec_t::shared)
163 {
164 m_shared_loop_ids.push_back(i);
165 m_shared_loop_sizes.push_back(m_dim_sizes[i]);
166 }
167}
168
169/////////////////////////////////////////////////////////////////////
170// Read PRIM dimensions using dim types (No Copy)
171/////////////////////////////////////////////////////////////////////
172// convert to int so negative values are allowed
173int l_dim_types_size = static_cast<int>(m_dim_types.size());
174for (int i = l_dim_types_size - 1; i >= 0; i--)
175{
176 if (m_exec_types[i] == exec_t::prim)
177 {
178 if (m_dim_id_prim_M == -1 && m_dim_types[i] == dim_t::m)
179 {
180 m_dim_id_prim_M = i;
181 }
182 else if (m_dim_id_prim_N == -1 && m_dim_types[i] == dim_t::n)
183 {
184 m_dim_id_prim_N = i;
185 }
186 else if (m_dim_id_prim_K == -1 && m_dim_types[i] == dim_t::k)
187 {
After checking all of these properties, we were able to create our kernels accordingly.
256// Check for Transposition
257/////////////////////////////////////////////////////////////////////
258if (m_dim_id_prim_M != -1)
259{
260 int64_t l_stride_in0 = m_strides_in0[m_dim_id_prim_M];
261 int64_t l_stride_out = m_strides_out[m_dim_id_prim_M];
262 // set transpose flag to true if the strides are different
263 m_transpose_output = l_stride_in0 != l_stride_out;
264}
265else
266{
267 // idk if we can check for transposition without M
268 m_transpose_output = false;
269}
270
271/////////////////////////////////////////////////////////////////////
272// Adjust strides based on primitive type and transposition
273/////////////////////////////////////////////////////////////////////
274if (prim_main == ptype_t::identity)
275{
276 if (!m_transpose_output)
277 {
278 m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_N];
279 m_adjusted_stride_in1 = 0;
280 m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
281 }
282 else
283 {
284 m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_N];
285 m_adjusted_stride_in1 = 0;
286 m_adjusted_stride_out = m_strides_out[m_dim_id_prim_M];
287 }
288}
289else if(prim_main == ptype_t::add || prim_main == ptype_t::sub ||
290 prim_main == ptype_t::mul || prim_main == ptype_t::div ||
291 prim_main == ptype_t::min || prim_main == ptype_t::max)
292{
293 m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_N];
294 m_adjusted_stride_in1 = m_strides_in1[m_dim_id_prim_N];
295 m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
296}
297else
298{
299 // GEMM & BRGEMM
300 m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_K];
301 m_adjusted_stride_in1 = m_strides_in1[m_dim_id_prim_N];
302 m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
303}
304m_adjusted_br_size_A = m_dim_id_prim_BR != -1 ? m_strides_in0[m_dim_id_prim_BR] : 1;
305m_adjusted_br_size_B = m_dim_id_prim_BR != -1 ? m_strides_in1[m_dim_id_prim_BR] : 1;
306
307/////////////////////////////////////////////////////////////////////
308// Generate kernels
309/////////////////////////////////////////////////////////////////////
310if (prim_first_touch != ptype_t::none)
311{
312 // no transposition
313 m_unary_first_touch.generate(m_dim_sizes[m_dim_id_prim_M],
314 m_dim_sizes[m_dim_id_prim_N],
315 0,
316 dtype,
317 prim_first_touch);
326                              1,
327                              0,
328                              0,
329                              0,
330                              dtype);
331    m_kernel_gemm_main = m_brgemm_main.get_kernel();
332}
333else if (prim_main == ptype_t::brgemm)
334{
335    // no transposition
336    m_brgemm_main.generate(m_dim_sizes[m_dim_id_prim_M],
337                           m_dim_sizes[m_dim_id_prim_N],
338                           m_dim_sizes[m_dim_id_prim_K],
339                           m_dim_sizes[m_dim_id_prim_BR],
340                           0,
367{
368    // no main kernel
369    m_kernel_gemm_main = nullptr;
370    m_kernel_binary_main = nullptr;
371    m_kernel_unary_main = nullptr;
372}
373
374if (prim_last_touch != ptype_t::none)
375{
376    // no transposition
377    m_unary_last_touch.generate(m_dim_sizes[m_dim_id_prim_M],
378                                m_dim_sizes[m_dim_id_prim_N],
381                                prim_last_touch);
382    m_kernel_last_touch = m_unary_last_touch.get_kernel();
383}
384
385m_kernel_first_touch_type = prim_first_touch;
386m_kernel_main_type = prim_main;
387m_kernel_last_touch_type = prim_last_touch;
388
389m_has_been_setup = true;
392}
393
5.2 Recursive Loops over Primitives
After constructing our kernels, we still needed to build an execution function that combines our main primitive with our first and last touches.
Our starting point is an execute function that takes the pointers to our tensors and passes them to our execute_iter function.
execute function
394void mini_jit::TensorOperation::execute(void const *tensor_in0,
395                                        void const *tensor_in1,
396                                        void *tensor_out)
397{
398    if (!m_has_been_setup)
399    {
400        std::cerr << "TensorOperation has not been setup. Call setup() before execute()." << std::endl;
401        return;
402    }
403
404    auto ptr_in0 = static_cast<char const *>(tensor_in0);
405    auto ptr_in1 = static_cast<char const *>(tensor_in1);
406    auto ptr_out = static_cast<char *>(tensor_out);
407
408    if (m_num_parallel_loops == 0)
409    {
410        // No shared loops, execute sequentially
411        execute_iter(0,
The ‘real’ execution happens in the execute_iter function. We first check whether the current iteration is the first or the last access to a block of our output tensor. Next, we update the pointers to the tensors accordingly. execute_iter then calls itself recursively, once for every seq dimension in our exec_types. Once there is no further recursive call to make, i.e. we have reached the first prim dimension, we execute the kernels.
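The execute_iter listing is only partially captured above, so the following is a minimal, hypothetical sketch of the recursion it performs. The parameter list, the first/last-access logic and the kernel dispatch are assumptions based on the listings above, not our exact implementation; strides are assumed to be given in FP32 elements.
// Hypothetical sketch of execute_iter; the real implementation also handles
// shared loops, transposition, different dtypes, etc.
void mini_jit::TensorOperation::execute_iter(int64_t id_loop,
                                             char const *ptr_in0,
                                             char const *ptr_in1,
                                             char *ptr_out,
                                             bool first_access,
                                             bool last_access)
{
    const int64_t l_size = m_dim_sizes[id_loop];
    for (int64_t l_it = 0; l_it < l_size; l_it++)
    {
        // only the first / last iteration of a K loop keeps the flags set,
        // since K loops revisit the same block of the output tensor
        const bool l_first = first_access && (m_dim_types[id_loop] != dim_t::k || l_it == 0);
        const bool l_last = last_access && (m_dim_types[id_loop] != dim_t::k || l_it == l_size - 1);

        // update the tensor pointers using the strides of this loop dimension
        char const *l_in0 = ptr_in0 + l_it * m_strides_in0[id_loop] * sizeof(float);
        char const *l_in1 = ptr_in1 + l_it * m_strides_in1[id_loop] * sizeof(float);
        char *l_out = ptr_out + l_it * m_strides_out[id_loop] * sizeof(float);

        if (id_loop + 1 < m_id_first_primitive_loop)
        {
            // more seq loops to traverse: recurse one level deeper
            execute_iter(id_loop + 1, l_in0, l_in1, l_out, l_first, l_last);
        }
        else
        {
            // reached the first prim dimension: run the kernels
            if (l_first) { /* first touch kernel on l_out */ }
            /* main kernel (GEMM / BRGEMM / unary / binary) on l_in0, l_in1, l_out */
            if (l_last) { /* last touch kernel on l_out */ }
        }
    }
}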
5.3 Performance Benchmarking
To test the performance of our kernels constructed at runtime, and to see whether everything works together seamlessly, we performed some reference benchmarks.
We were given a number of configuration parameters that we should check:
| Variable | 1st Value | 2nd Value | 3rd Value |
|---|---|---|---|
| dtype | FP32 | FP32 | FP32 |
| prim_first_touch | None | None | Zero |
| prim_main | GEMM | BRGEMM | BRGEMM |
| prim_last_touch | None | None | ReLU |
| dim_types | (M, N, K, M, N, K) | (M, N, K, M, N, K) | (M, N, K, M, N, K) |
| exec_types | (Seq, Seq, Seq, Prim, Prim, Prim) | (Seq, Seq, Prim, Prim, Prim, Prim) | (Seq, Seq, Prim, Prim, Prim, Prim) |
| dim_sizes | (32, 32, 8, 32, 32, 32) | (32, 32, 8, 32, 32, 32) | (32, 32, 8, 32, 32, 32) |
| strides_in0 | (8192, 0, 1024, 1, 0, 32) | (8192, 0, 1024, 1, 0, 32) | (8192, 0, 1024, 1, 0, 32) |
| strides_in1 | (0, 8192, 1024, 0, 32, 1) | (0, 8192, 1024, 0, 32, 1) | (0, 8192, 1024, 0, 32, 1) |
| strides_out | (32768, 1024, 0, 1, 32, 0) | (32768, 1024, 0, 1, 32, 0) | (32768, 1024, 0, 1, 32, 0) |
However, when benchmarking our implementation with these strides, we ran into memory leaks. For that reason, we decided to adjust the strides of the benchmarks slightly.
| Variable | 1st Value | 2nd Value | 3rd Value |
|---|---|---|---|
| dim_sizes | (32, 32, 8, 32, 32, 32) | (32, 32, 8, 32, 32, 32) | (32, 32, 8, 32, 32, 32) |
| strides_in0 | (1024, 0, 32768, 1, 0, 32) | (1024, 0, 32768, 1, 0, 32) | (1024, 0, 32768, 1, 0, 32) |
| strides_in1 | (0, 8192, 32, 0, 1024, 1) | (0, 8192, 32, 0, 1024, 1) | (0, 8192, 32, 0, 1024, 1) |
| strides_out | (32, 32768, 0, 1, 1024, 0) | (32, 32768, 0, 1, 1024, 0) | (32, 32768, 0, 1, 1024, 0) |
When benchmarking our configurations, we achieved the following GFLOPS performance:
GFLOPS performance of our benchmark configurations
1Running TensorOperationBench benchmark #1
2Total time (s): 3.0052
3Total reps: 398
4Total floating point operations: 213674622976
5Estimated GFLOPS/sec: 71.1017
6--------------------------------------------------
7Running TensorOperationBench benchmark #2
8Total time (s): 3.0062
9Total reps: 413
10Total floating point operations: 221727686656
11Estimated GFLOPS/sec: 73.7568
12--------------------------------------------------
13Running TensorOperationBench benchmark #3
14Total time (s): 3.00738
15Total reps: 400
16Total floating point operations: 214748364800
17Estimated GFLOPS/sec: 71.4071
18--------------------------------------------------
The results show that we achieve between 71 and 73 GFLOPS across all three configurations.
These results are roughly consistent with the performance of the kernels when benchmarked independently.
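As a sanity check for benchmark #1: each repetition performs 2 · 32 · 32 · 8 · 32 · 32 · 32 ≈ 5.37 · 10⁸ floating point operations, so 398 repetitions correspond to roughly 2.137 · 10¹¹ operations, and dividing by the 3.0052 s runtime gives the reported ≈ 71.1 GFLOPS.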
Note
Since the submission, we made some minor changes to our implementation.
First, we fixed some errors and were then able to use the strides that we were provided with.
Second, we enhanced our matmul_m_n_k implementation, which now computes kernels of size 16x4 instead of 8x4.
This increased our results from 71-73 GFLOPS to around 90-91 GFLOPS.
GFLOPS performance of the initial benchmark configuration with the enhanced matmul kernel
1Running TensorOperationBench benchmark #1
2Total time (s): 3.00343
3Total reps: 510
4Total floating point operations: 273804165120
5Estimated GFLOPS/sec: 91.1637
6--------------------------------------------------
7Running TensorOperationBench benchmark #2
8Total time (s): 3.00537
9Total reps: 514
10Total floating point operations: 275951648768
11Estimated GFLOPS/sec: 91.8195
12--------------------------------------------------
13Running TensorOperationBench benchmark #3
14Total time (s): 3.00405
15Total reps: 505
16Total floating point operations: 271119810560
17Estimated GFLOPS/sec: 90.2515
18--------------------------------------------------
5.5 Optimization Passes
Our approach to enhancing the performance of the tensor operations was to use a vector of structs, one for each dimension:
17struct Dimension
18{
19 //! Type of the dimension (M, N, K)
20 dim_t type = dim_t::m;
21 //! Execution type (Prim, Seq, Shared, ...)
22 exec_t exec_type = exec_t::undefined;
23 //! Dimension size
24 int64_t size = 0;
25 //! Stride in the first input tensor
26 int64_t stride_in0 = 0;
27 //! Stride in the second input tensor
28 int64_t stride_in1 = 0;
29 //! Stride in the output tensor
30 int64_t stride_out = 0;
31
32 /**
33 * @brief Construct a new Dimension object.
34 *
35 * @param type Type of the dimension (M, N, K).
36 * @param exec_type Execution type (Prim, Seq, Shared, ...).
37 * @param size Size of the dimension.
38 * @param stride_in0 Stride in the first input tensor.
39 * @param stride_in1 Stride in the second input tensor.
40 * @param stride_out Stride in the output tensor.
41 */
42 Dimension(dim_t type,
43 exec_t exec_type,
44 int64_t size,
45 int64_t stride_in0,
46 int64_t stride_in1,
47 int64_t stride_out)
48 : type(type),
49 exec_type(exec_type),
50 size(size),
51 stride_in0(stride_in0),
52 stride_in1(stride_in1),
53 stride_out(stride_out)
54 {
55 if (size <= 0)
56 {
57 throw std::invalid_argument("Dimension size needs to be greater than 0");
58 }
59 }
60};
This struct is used to store all information about a dimension.
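As a hypothetical illustration, a small column-major GEMM C[M,N] += A[M,K] · B[K,N] with M = N = K = 64 could be described as follows (sizes and strides chosen for illustration only):
// Constructor order: type, exec_type, size, stride_in0, stride_in1, stride_out.
std::vector<mini_jit::ir::Dimension> l_dims;
l_dims.emplace_back(mini_jit::dim_t::m, mini_jit::exec_t::undefined, 64, 1, 0, 1);   // M: unit stride in A and C
l_dims.emplace_back(mini_jit::dim_t::n, mini_jit::exec_t::undefined, 64, 0, 64, 64); // N: stride K in B, stride M in C
l_dims.emplace_back(mini_jit::dim_t::k, mini_jit::exec_t::undefined, 64, 64, 1, 0);  // K: stride M in A, unit stride in B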
After setting this up, we could create our optimization passes.
5.5.1 Primitive Identification
The first optimization that we performed was to find primitive dimensions. This optimization is useful for cases where we are given only sequential loops. Our approach to this optimization was the following:
K2 prim dimension for the BRGEMM case
209    }
210 }
211
212 // lastly, set all remaining dimensions to seq
213 for (auto &dim : dimensions)
214 {
215 if (dim.exec_type == exec_t::undefined)
216 {
217 dim.exec_type = exec_t::seq;
218 }
219 }
220
221 return; // all primary dimensions set
222}
223// BINARY CASE
224else if (!l_has_k_dim)
225{
226 // check for existing primary dimensions
227 int prim_count = std::count_if(dimensions.begin(), dimensions.end(),
228 [](const mini_jit::ir::Dimension &dim)
229 {
230 return dim.type == dim_t::c && dim.exec_type == exec_t::prim;
231 });
We identify the respective dimensions by looking at the strides in the input and output tensors.
As a starting point, we use the pattern that the tensors of a column-major BRGEMM have to follow: ...M, ...K1 -> ...M (the unit-stride dimension of each tensor).
This means that the K2 dimension that we need for our BRGEMM must not have unit stride in the first input tensor.
Similarly, we derived stride criteria for the:
- M dimension
- N dimension
- K1 dimension

For the N dimension, where the strides did not give us a clear indication of which dimension to choose, we simply chose the N dimension with the smallest stride:
N prim dimension with smallest stride
261        std::rotate(l_dim_m_it, l_dim_m_it + 1, dimensions.end());
262 }
263}
264else
265{
266 throw std::invalid_argument("Optimizer: No suitable primary dimension M found.");
267}
268
269/////////////////////////////////////////////////////////////////
270// FIND PRIM N
271/////////////////////////////////////////////////////////////////
272// req: choose the one with smallest stride
273int l_n_dim_stride = INT_MAX;
274int l_n_dim_id = -1;
275for (size_t i = 0; i < dimensions.size(); i++)
276{
277 if (dimensions[i].type == dim_t::n &&
278 dimensions[i].stride_in0 == dimensions[i].stride_in1)
279 {
We performed this ‘identification’ process in the order K2, M, N, K1.
The reason for this order is that, after identifying a dimension, we rotate it to the end of the dimension order.
This ultimately leads to the structure ..., K2, M, N, K1 for our ‘identified’ primitive dimensions.
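To illustrate the effect of the rotations alone, here is a toy example with plain labels instead of our actual Dimension objects:
#include <algorithm>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> l_order = {"M", "K1", "N", "K2"}; // arbitrary input order
    // identify in the order K2, M, N, K1 and rotate each identified dimension to the back
    for (std::string l_dim : {"K2", "M", "N", "K1"})
    {
        auto l_it = std::find(l_order.begin(), l_order.end(), l_dim);
        std::rotate(l_it, l_it + 1, l_order.end());
    }
    // l_order is now {"K2", "M", "N", "K1"}, i.e. ..., K2, M, N, K1
    return 0;
}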
5.5.2 Dimension Splitting
For our second optimization pass we decided to look at the dimension sizes of our loops. We introduced a max_kernel_size
parameter,
which specifies the maximum allowed size for a dimension. If a dimension with a size larger than the maximum is found, the dimension splitter
will try to split it into new dimensions with optimized sizes. The entry point for this optimization is the splitDimensions
function:
void mini_jit::ir::Optimizer::splitDimensions(std::vector<mini_jit::ir::Dimension> &dimensions,
int64_t max_kernel_size)
{
// Dimensions should be split if they are too large (> max_kernel_size)
for (size_t i = 0; i < dimensions.size(); i++)
{
if (dimensions[i].size > max_kernel_size)
{
int64_t l_size_dim_0 = 0;
int64_t l_size_dim_1 = 0;
findBestSplit(dimensions[i].size,
max_kernel_size,
dimensions[i].type,
l_size_dim_0,
l_size_dim_1);
if (l_size_dim_0 > 1)
{
// create a new seq dimension
mini_jit::ir::Dimension l_dim_new(dimensions[i].type,
exec_t::seq,
l_size_dim_0,
dimensions[i].stride_in0 * l_size_dim_1,
dimensions[i].stride_in1 * l_size_dim_1,
dimensions[i].stride_out * l_size_dim_1);
// update the original dimension size
dimensions[i].size = l_size_dim_1;
// insert the new dimension at the back, so it will be checked for a split again
dimensions.push_back(l_dim_new);
}
}
}
}
For each dimension whose size is too large, it finds the best split for our kernels and creates a new dimension. The size of the original dimension is updated to l_size_dim_1, which is guaranteed to be smaller than or equal to max_kernel_size. However, the new dimension l_dim_new might still have a size larger than max_kernel_size, which is why it is inserted at the end of the dimensions vector, where it will be checked for a possible split again in a later iteration.
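As a small, hypothetical example (assuming the pass can be invoked directly as shown), an M dimension of size 1600 with unit stride and max_kernel_size = 1024 is split as follows; the resulting values follow from findBestSplit, which is explained next:
std::vector<mini_jit::ir::Dimension> l_dims = {
    {mini_jit::dim_t::m, mini_jit::exec_t::seq, 1600, 1, 0, 1}};
mini_jit::ir::Optimizer::splitDimensions(l_dims, 1024);
// l_dims[0] is now (m, seq, size 800, strides 1, 0, 1) and a new dimension
// (m, seq, size 2, strides 800, 0, 800) has been appended: stepping it once
// jumps over one 800-element block of the original dimension.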
But what does findBestSplit
do?
The way our kernels were implemented makes their execution more efficient for specific dimension sizes. Considering the M dimension, a size that is a multiple of 16 is optimal for most kernels, since we manually optimized the kernels for this case. As for the N dimension size, a multiple of 4 is optimal for most kernels.
In the K dimension, we do not have such optimizations and the dimension size can be chosen freely, as long as it is smaller than max_kernel_size
.
The following code snippet shows the implementation of findBestSplit
for the M and N dimensions:
o_size_0 = 1;
o_size_1 = i_size;
if (i_type == dim_t::m)
{
// multiples of (multiples of) 4 are efficient (LDP, STP)
for (int64_t i = 16; i > 4; i -= 4)
{
findLargestMultipleOfDivisor(i, i_size, i_max_kernel_size, o_size_0, o_size_1);
if (o_size_0 > 1)
{
return;
}
}
// split by 2
findLargestMultipleOfDivisor(2, i_size, i_max_kernel_size, o_size_0, o_size_1);
if (o_size_0 > 1)
{
return;
}
}
// for n, we want multiples of 4
else if (i_type == dim_t::n)
{
// split by 4
findLargestMultipleOfDivisor(4, i_size, i_max_kernel_size, o_size_0, o_size_1);
if (o_size_0 > 1)
{
return;
}
// split by 2
findLargestMultipleOfDivisor(2, i_size, i_max_kernel_size, o_size_0, o_size_1);
if (o_size_0 > 1)
{
return;
}
}
But what does findLargestMultipleOfDivisor
do?
As the name suggests, this helper function tries to find the largest multiple of a given divisor. Let’s say the given divisor is 16
, the input dimension size is 1600 and the i_max_kernel_size
is 1024.
Then, findLargestMultipleOfDivisor
will try to find the largest multiple of 16 which divides 1600 and is smaller than or equal to 1024. The result of this computation is 2 for o_size_0
and 800 for o_size_1
.
For the more curious reader, the implementation of findLargestMultipleOfDivisor
is given below:
findLargestMultipleOfDivisor function of the Optimizer
void mini_jit::ir::Optimizer::findLargestMultipleOfDivisor(int64_t i_divisor,
int64_t i_size,
int64_t i_max_size,
int64_t &o_size_0,
int64_t &o_size_1)
{
if (i_divisor <= 0 || i_size <= 0 || i_max_size <= 0 || i_divisor > i_max_size)
{
return;
}
// start: largest multiple of i_divisor < i_max_size
int64_t l_max_divisible = (i_max_size / i_divisor) * i_divisor;
for (int64_t l_m = l_max_divisible; l_m >= i_divisor; l_m -= i_divisor)
{
// we found an m that divides i_size! it is also the largest
if (i_size % l_m == 0)
{
o_size_0 = i_size / l_m;
o_size_1 = l_m;
return;
}
}
}
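A quick check of the worked example from above, assuming the function can be called as a static helper:
int64_t l_size_0 = 1;
int64_t l_size_1 = 1600;
// divisor 16, dimension size 1600, maximum kernel size 1024
mini_jit::ir::Optimizer::findLargestMultipleOfDivisor(16, 1600, 1024, l_size_0, l_size_1);
// l_size_0 == 2, l_size_1 == 800 (800 is the largest multiple of 16 <= 1024 that divides 1600)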
5.5.3 Shared Memory Parallelization
Our third optimization pass turns every loop that is not a prim dimension and is of dimension type M or N into a shared loop.
For that, we first check how many loops already have the execution type shared:
364{
365 if (dimensions[i].type == dim_t::n &&
366 dimensions[i].stride_in0 == 0)
367 {
368 int l_current_strides = dimensions[i].stride_in1 + dimensions[i].stride_out;
369 if (l_current_strides < l_n_dim_strides)
370 {
371 l_n_dim_strides = l_current_strides;
If we already have a high number of shared loops, we do not create any more and simply return.
Otherwise, we check the seq dimensions for potential candidates:
shared loop candidates
386if (l_n_dim_id != static_cast<int>(dimensions.size()) - 1)
387{
388 std::rotate(dimensions.begin() + l_n_dim_id, dimensions.begin() + l_n_dim_id + 1, dimensions.end());
389}
390/////////////////////////////////////////////////////////////////
391// FIND PRIM K
392/////////////////////////////////////////////////////////////////
393// req: unit stride in in1, stride_out has to be 0
394auto l_dim_k_it = std::find_if(dimensions.begin(), dimensions.end(),
395 [](const mini_jit::ir::Dimension &dim)
396 {
397 return dim.type == dim_t::k &&
398 dim.stride_in1 == 1 &&
As a last step, we move all of our shared loops to the front of the order:
shared loops to the front
401if (l_dim_k_it != dimensions.end())
402{
403 // set dimension data
404 l_dim_k_it->type = dim_t::k;
5.5.4 Dimension Fusion
Note
This part of the Optimizer was implemented much later, as part of the 7.4 Progress of Week 2 of our final project phase.
The idea behind Dimension Fusion is that when certain dimensions have very small sizes, fusing them can improve cache efficiency and simplify tensor expressions. It also enables our existing dimension splitter to operate more effectively, as it can now split the fused dimensions in ways optimized for our kernels, rather than being constrained by the original tensor structure. In other words, Dimension Fusion will be the first step in our optimizer, simplifying the tensor expression upfront so it can then be split in an optimized way and finally, have its primitive dimensions identified.
The first step was to introduce a new min_kernel_size
parameter. It allows the user to specify the minimum dimension size a kernel should have. If a dimension is smaller than that, the dimension fuser will try to look for candidates to fuse with. This process happens in the new fuseDimensions
function of the Optimizer.
void mini_jit::ir::Optimizer::fuseDimensions(std::vector<mini_jit::ir::Dimension> &dimensions,
int64_t min_kernel_size)
{
for (size_t i = 0; i < dimensions.size(); i++)
{
mini_jit::ir::Dimension &l_dim_0 = dimensions[i];
if (l_dim_0.size < min_kernel_size)
{
// find a dimension that can be fused with the current one
for (size_t j = 0; j < dimensions.size(); j++)
{
if (i == j) continue; // skip self
mini_jit::ir::Dimension &l_dim_1 = dimensions[j];
if (l_dim_0.type == l_dim_1.type &&
(l_dim_0.exec_type == l_dim_1.exec_type ||
l_dim_0.exec_type == exec_t::undefined ||
l_dim_1.exec_type == exec_t::undefined) &&
l_dim_1.stride_in0 == l_dim_0.size * l_dim_0.stride_in0 &&
l_dim_1.stride_in1 == l_dim_0.size * l_dim_0.stride_in1 &&
l_dim_1.stride_out == l_dim_0.size * l_dim_0.stride_out)
{
// fuse the two dimensions
l_dim_0.size *= l_dim_1.size;
// remove the fused dimension
dimensions.erase(dimensions.begin() + j);
j--; // adjust index after erasing
}
}
}
}
}
Here, l_dim_0 is the dimension whose size is smaller than min_kernel_size, meaning that we would like to fuse it with another candidate. However, the candidate (l_dim_1) the function looks for needs to fulfill some criteria:
- Same dimension type as l_dim_0 (M, N, K, C)
- Same execution type as l_dim_0, or either execution type is undefined
- The stride of l_dim_1 needs to equal the product of the stride and the size of l_dim_0 (two dimensions X and Y can be fused if, for all tensors, stride(X) = |Y| ⨉ stride(Y))
If a fitting candidate has been found, l_dim_0
and l_dim_1
can be fused. This involves multiplying the dimension sizes and removing the candidate from the dimensions vector. The strides do not need to be adjusted, as the original stride of the small l_dim_0
is still correct.
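As a hypothetical example (assuming the pass can be invoked directly as shown), consider an M dimension of size 8 with unit stride and a second M dimension of size 4 whose strides equal 8 · 1 in every tensor, so the candidate criteria above are satisfied:
std::vector<mini_jit::ir::Dimension> l_dims = {
    {mini_jit::dim_t::m, mini_jit::exec_t::undefined, 8, 1, 0, 1},  // l_dim_0: too small
    {mini_jit::dim_t::m, mini_jit::exec_t::undefined, 4, 8, 0, 8}}; // candidate: stride = |l_dim_0| * stride(l_dim_0)
mini_jit::ir::Optimizer::fuseDimensions(l_dims, /*min_kernel_size=*/16);
// l_dims now contains a single dimension (m, undefined, size 32, strides 1, 0, 1).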
After implementing dimension fusion, we also had to make adjustments to the dimension splitter. Previously, we would split dimensions by finding the largest possible split for one dimension. For example, if the given dimension size was 1600 and the maximum kernel size 1024, the function would have returned 2 for o_size_0 and 800 for o_size_1, because 800 is the largest multiple of 16 that divides 1600 and is less than or equal to 1024. This was problematic, because we then had a dimension of size 2, which was very small and could have led to inefficiencies. Our solution was to introduce the min_kernel_size parameter to the dimension splitter as well. Specifically, we adjusted the findBestSplit function, which now only accepts a split once the min_kernel_size is reached:
if (i_type == dim_t::m)
{
// multiples of (multiples of) 4 are efficient (LDP, STP)
for (int64_t i = 16; i > 4; i -= 4)
{
findLargestMultipleOfDivisor(i, i_size, i_max_kernel_size, i_min_kernel_size, o_size_0, o_size_1);
if (o_size_0 >= i_min_kernel_size)
{
return;
}
}
// split by 2
findLargestMultipleOfDivisor(2, i_size, i_max_kernel_size, i_min_kernel_size, o_size_0, o_size_1);
if (o_size_0 >= i_min_kernel_size)
{
return;
}
}
Consequently, findLargestMultipleOfDivisor
had to be adjusted as well, with a simple if-condition:
void mini_jit::ir::Optimizer::findLargestMultipleOfDivisor(int64_t i_divisor,
int64_t i_size,
int64_t i_max_size,
int64_t i_min_size,
int64_t &o_size_0,
int64_t &o_size_1)
{
if (i_divisor <= 0 || i_size <= 0 || i_max_size <= 0 || i_min_size <= 0 ||
i_divisor > i_max_size || i_size < i_min_size)
{
return;
}
// start: largest multiple of i_divisor < i_max_size
int64_t l_max_divisible = (i_max_size / i_divisor) * i_divisor;
for (int64_t l_m = l_max_divisible; l_m >= i_divisor; l_m -= i_divisor)
{
// we found an m that divides i_size! it is also the largest
if (i_size % l_m == 0)
{
int64_t candidate_size_0 = i_size / l_m;
int64_t candidate_size_1 = l_m;
if (candidate_size_0 >= i_min_size && candidate_size_1 >= i_min_size)
{
o_size_0 = candidate_size_0;
o_size_1 = candidate_size_1;
return;
}
}
}
}
Candidates for splitting are now only chosen if both dimension sizes are at least as large as the specified minimum kernel size.
Therefore, the new dimension splitter now outputs 50 and 32 as a split of 1600, if min_kernel_size
is set to 16.
5.5.6 Performance Benchmarks
We also benchmarked the results for some configurations:
GFLOPS performance for sample configurations
1Running SharedTensorOperationBench benchmark #1
2Total time (s): 3.01164
3Total reps: 101
4Total floating point operations: 827392000000
5Estimated GFLOPS/sec: 274.731
6--------------------------------------------------
7#####################################################
8Testing different kernel sizes
9#####################################################
10Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 1024)
11Total time (s): 3.02859
12Total reps: 105
13Total floating point operations: 860160000000
14Estimated GFLOPS/sec: 284.013
15--------------------------------------------------
16Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 1024)
17Total time (s): 3.02753
18Total reps: 105
19Total floating point operations: 860160000000
20Estimated GFLOPS/sec: 284.113
21--------------------------------------------------
22Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 512)
23Total time (s): 3.0037
24Total reps: 95
25Total floating point operations: 778240000000
26Estimated GFLOPS/sec: 259.093
27--------------------------------------------------
28Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 512)
29Total time (s): 3.00784
30Total reps: 95
31Total floating point operations: 778240000000
32Estimated GFLOPS/sec: 258.738
33--------------------------------------------------
34Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 256)
35Total time (s): 3.01035
36Total reps: 112
37Total floating point operations: 917504000000
38Estimated GFLOPS/sec: 304.783
39--------------------------------------------------
40Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 256)
41Total time (s): 3.00121
42Total reps: 110
43Total floating point operations: 901120000000
44Estimated GFLOPS/sec: 300.253
45--------------------------------------------------
46Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 125)
47Total time (s): 3.00583
48Total reps: 129
49Total floating point operations: 1056768000000
50Estimated GFLOPS/sec: 351.573
51--------------------------------------------------
52Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 125)
53Total time (s): 3.00655
54Total reps: 129
55Total floating point operations: 1056768000000
56Estimated GFLOPS/sec: 351.488
57--------------------------------------------------
58Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 64)
59Total time (s): 3.01405
60Total reps: 119
61Total floating point operations: 974848000000
62Estimated GFLOPS/sec: 323.435
63--------------------------------------------------
64Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 64)
65Total time (s): 3.01289
66Total reps: 119
67Total floating point operations: 974848000000
68Estimated GFLOPS/sec: 323.559
69--------------------------------------------------
70Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 32)
71Total time (s): 3.00815
72Total reps: 125
73Total floating point operations: 1024000000000
74Estimated GFLOPS/sec: 340.409
75--------------------------------------------------
76Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 32)
77Total time (s): 3.00952
78Total reps: 126
79Total floating point operations: 1032192000000
80Estimated GFLOPS/sec: 342.975
81--------------------------------------------------
82Running SharedTensorOperationBench benchmark (thread_target: 64, max_kernel_size: 16)
83Total time (s): 3.0137
84Total reps: 40
85Total floating point operations: 327680000000
86Estimated GFLOPS/sec: 108.73
87--------------------------------------------------
88Running SharedTensorOperationBench benchmark (thread_target: 256, max_kernel_size: 16)
89Total time (s): 3.01834
90Total reps: 119
91Total floating point operations: 974848000000
92Estimated GFLOPS/sec: 322.975
93--------------------------------------------------
Depending on the selected thread_target and max_kernel_size, our results varied massively. The highest performance we achieved was around 350 GFLOPS.
5.6 Unary Operations
After supporting the standard primitives with GEMM and BRGEMM, we now also allow primitives like copy / identity, transposition, and permutation.
5.6.1 Backend Extension
The distinction from the other primitives is that all dimensions are of type dim_t::c.
To allow this kind of primitive, we had to make some small adjustments to our TensorOperation:
M and N dimensions based on stride in the input
190    }
191 else if (m_dim_id_prim_K != -1 && m_dim_id_prim_BR == -1 && m_dim_types[i] == dim_t::k)
192 {
193 m_dim_id_prim_BR = i;
194 }
195 }
196}
197
198/////////////////////////////////////////////////////////////////////
199// Read SEQ and SHARED dimensions using dim types
200/////////////////////////////////////////////////////////////////////
201for (size_t i = 0; i < m_dim_types.size(); ++i)
202{
203 if (m_exec_types[i] == exec_t::seq)
204 {
205 if (m_dim_types[i] == dim_t::m)
206 {
207 m_dim_id_seq_M = i;
208 }
209 else if (m_dim_types[i] == dim_t::n)
210 {
identity primitive
295    m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
296}
297else
298{
299 // GEMM & BRGEMM
300 m_adjusted_stride_in0 = m_strides_in0[m_dim_id_prim_K];
301 m_adjusted_stride_in1 = m_strides_in1[m_dim_id_prim_N];
302 m_adjusted_stride_out = m_strides_out[m_dim_id_prim_N];
303}
5.6.2 Optimization Passes
To run our optimization passes on these primitives, we again had to make some adjustments, this time in our Optimizer.
For our identifyPrimitives function, we first check whether we have dim_t::c as a dimension type.
If this is the case, we continue to find the prim dimensions:
73// Handle identity case first
74auto l_has_c_dim = std::any_of(dimensions.begin(), dimensions.end(),
75 [](const mini_jit::ir::Dimension &dim)
76 {
77 return dim.type == dim_t::c;
78 });
79
80// Handle binary case
81auto l_has_k_dim = std::any_of(dimensions.begin(), dimensions.end(),
82                               [](const mini_jit::ir::Dimension &dim)
83 {
84 return dim.type == dim_t::k;
85 });
86
87if (l_has_c_dim)
88{
89 // check that all dimensions are c
90 if (!std::all_of(dimensions.begin(), dimensions.end(),
91 [](const mini_jit::ir::Dimension &dim)
 97}
98// check for existing primary dimensions
99int prim_c_count = std::count_if(dimensions.begin(), dimensions.end(),
100 [](const mini_jit::ir::Dimension &dim)
101 {
102 return dim.type == dim_t::c && dim.exec_type == exec_t::prim;
103 });
104if (prim_c_count == 2)
105{
106 return; // primary dimensions already set
107}
108else if (prim_c_count != 0)
109{
110 throw std::invalid_argument("Optimizer: Expected 0 or 2 primary dimensions of type 'c', found " + std::to_string(prim_c_count) + ". Try setting all dimensions to seq or undefined.");
111}
112
113/////////////////////////////////////////////////////////////////
114// FIND UNARY PRIM M
115/////////////////////////////////////////////////////////////////
116// req: unit stride in in0. Out might not be 1 if operation transposes.
117auto l_dim_m_it = std::find_if(dimensions.begin(), dimensions.end(),
118 [](const mini_jit::ir::Dimension &dim)
119 {
120 return (dim.type == dim_t::c) &&
121 dim.stride_in0 == 1 &&
122 dim.stride_in1 == 0;
123 });
124
125bool l_transpose = false;
Check for transposition and find dimensions accordingly
126if (l_dim_m_it != dimensions.end())
127{
128 // transpose if the output stride in M is not 1
129 l_transpose = l_dim_m_it->stride_out != 1;
130 // set dimension data
131 l_dim_m_it->exec_type = exec_t::prim;
132 if (l_dim_m_it != dimensions.end() - 1)
133 {
134 // move M to the back
135 std::rotate(l_dim_m_it, l_dim_m_it + 1, dimensions.end());
136 }
137}
138else
139{
140 throw std::invalid_argument("Optimizer: No suitable primary dimension M found.");
141}
142
143if (l_transpose)
144{
145 /////////////////////////////////////////////////////////////////
146 // FIND UNARY PRIM N
147 /////////////////////////////////////////////////////////////////
148 // req: unit stride in out.
149 auto l_dim_n_it = std::find_if(dimensions.begin(), dimensions.end(),
150 [](const mini_jit::ir::Dimension &dim)
151 {
152 return (dim.type == dim_t::c) &&
153 dim.stride_out == 1 &&
154 dim.stride_in1 == 0;
155 });
If we do not have a transposition
we would simply look for the smallest stride and set the dimensions accordingly:
identity primitive
160        l_dim_n_it->exec_type = exec_t::prim;
161 if (l_dim_n_it != dimensions.end() - 1)
162 {
163 // move N to the back
164 std::rotate(l_dim_n_it, l_dim_n_it + 1, dimensions.end());
165 }
166 }
167 else
168 {
169 throw std::invalid_argument("Optimizer: No suitable primary dimension N found.");
170 }
171}
172else
173{
174 // TODO: check if this is ok
175
176 /////////////////////////////////////////////////////////////////
177 // FIND UNARY PRIM N
178 /////////////////////////////////////////////////////////////////
179 // req: choose the one with the smallest stride in in0
180 int l_n_dim_stride = INT_MAX;
The last step is to set the remaining undefined dimensions to seq, as the next optimization pass will look for ideal shared loops.
However, in our createSharedLoops function, we did not have to make any adjustments.
For our splitDimensions function, we would now also check whether we have dim_t::c as a dimension type:
dim_t::c
465 // increase thread number for each existing shared dimension
466 l_num_threads *= dimensions[i].size;
467 }
468}
469
470if (l_num_threads >= thread_target)
471{
472 // make sure that the shared loops are at the front
473 std::stable_partition(dimensions.begin(), dimensions.end(),
474 [](const mini_jit::ir::Dimension &dim)
475 {
476 return dim.exec_type == exec_t::shared;
477 });
478 // no need to create more shared loops
479 return;
480}
481
482// Creation of new shared loops:
483for (size_t i = 0; i < dimensions.size(); i++)
484{
485 // if the dimension can be set to shared and we did not reach the target number of threads yet
486 // we set the dimension to shared
487 // also dont parallelize the k dimension (see class slides)
488 if ((dimensions[i].exec_type == exec_t::seq || dimensions[i].exec_type == exec_t::undefined) &&
489 dimensions[i].type != dim_t::k &&
5.6.3 Reference Implementation
For our reference implementation, we used an example with four dimensions, labeled trus.
The operation changes this trus order to turs.
301{
302 const mini_jit::ptype_t first_touch_type = mini_jit::ptype_t::none;
303 const mini_jit::ptype_t main_type = mini_jit::ptype_t::identity;
304 const mini_jit::ptype_t last_touch_type = mini_jit::ptype_t::none;
305
306 const int T = GENERATE(3, 4, 7);
307 const int R = GENERATE(3, 4, 7);
308 const int U = GENERATE(3, 4, 7);
309 const int S = GENERATE(3, 4, 7);
310
311 const int SIZE = T * R * U * S;
312
313 float *A = new float[SIZE];
314 float *C = new float[SIZE];
315 float *C_expected = new float[SIZE];
316
317 std::random_device rd;
318 std::mt19937 gen(rd());
320for (int i = 0; i < SIZE; ++i)
321{
322 A[i] = dist(gen);
323}
324
325// Compute C_expected
326for (int t = 0; t < T; ++t)
327{
328 for (int r = 0; r < R; ++r)
329 {
330 for (int u = 0; u < U; ++u)
331 {
332 for (int s = 0; s < S; ++s)
333 {
334 // Calculate index in output format (t,r,u,s) using strides_out
335 int l_idx_c_exp = t * (U * R * S) + r * S + u * (R * S) + s;
336 // Calculate index in input format (t,u,r,s) using strides_in0
337 int l_idx_a = t * (R * U * S) + u * S + r * (U * S) + s;
338 C_expected[l_idx_c_exp] = A[l_idx_a];
339 }
340 }
341 }
Then we would prepare the execution by setting all arguments accordingly:
344std::vector<mini_jit::dim_t> dim_types = {
345 mini_jit::dim_t::c, // t
346 mini_jit::dim_t::c, // r
347 mini_jit::dim_t::c, // u
348 mini_jit::dim_t::c // s
349};
350
351std::vector<mini_jit::exec_t> exec_types = {
352 mini_jit::exec_t::seq, // t
353 mini_jit::exec_t::seq, // r
354 mini_jit::exec_t::prim, // u
355 mini_jit::exec_t::prim // s
356};
357
358std::vector<int64_t> dim_sizes = {
359 T, R, U, S};
360
361std::vector<int64_t> strides_in0 = {
362 R * U * S, // t
363 U * S, // r
364 S, // u
365 1 // s
366};
367
368std::vector<int64_t> strides_in1 = {0, 0, 0, 0};
369
370std::vector<int64_t> strides_out = {
371 U * R * S, // t
372 S, // r
373 R * S, // u
374 1 // s
375};
Finally, we would execute our implementation.
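Putting it together, the final calls look roughly like the sketch below. The argument order of setup follows the sketch in Section 5.1 and is an assumption, not our verbatim code; for the identity primitive the second input tensor is unused.
mini_jit::TensorOperation l_top;
l_top.setup(mini_jit::dtype_t::fp32,
            first_touch_type,   // none
            main_type,          // identity
            last_touch_type,    // none
            dim_types,
            exec_types,
            dim_sizes,
            strides_in0,
            strides_in1,
            strides_out);
l_top.execute(A, nullptr, C);
// C should now match C_expected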