mini_jit

enum class mini_jit::exec_t : uint32_t

execution type

Values:

enumerator seq
enumerator prim
enumerator shared
enumerator undefined
enum class mini_jit::ptype_t : uint32_t

primitive type

Values:

enumerator zero
enumerator identity
enumerator relu
enumerator gemm
enumerator brgemm
enumerator square
enumerator reciprocal
enumerator increment
enumerator decrement
enumerator add
enumerator sub
enumerator mul
enumerator div
enumerator min
enumerator max
enumerator fast_sigmoid
enumerator sigmoid_interp
enumerator sigmoid_taylor
enumerator none
enum class mini_jit::dim_t : uint32_t

dimension type

Values:

enumerator c
enumerator m
enumerator n
enumerator k
enum class mini_jit::dtype_t : uint32_t

data type

Values:

enumerator fp32
enumerator fp64
enum class mini_jit::error_t : int32_t

error codes

Values:

enumerator success
enumerator wrong_dimension
enumerator wrong_ptype
enumerator operation_not_supported
enumerator wrong_matrix_ordering_format
enumerator wrong_dtype
enumerator wrong_exec_type
inline const std::string mini_jit::to_string(exec_t e)
inline const std::string mini_jit::to_string(ptype_t p)
inline const std::string mini_jit::to_string(dim_t d)
inline const std::string mini_jit::to_string(dtype_t d)
inline const std::string mini_jit::to_string(error_t e)
class Benchmark

Subclassed by mini_jit::benchmarks::EinsumTreeBench, mini_jit::benchmarks::FastSigmoidPrimitiveBench, mini_jit::benchmarks::IdentityPrimitiveBench, mini_jit::benchmarks::IdentityTransPrimitiveBench, mini_jit::benchmarks::MatmulBrMNKBench, mini_jit::benchmarks::MatmulMNKBench, mini_jit::benchmarks::ReLUPrimitiveBench, mini_jit::benchmarks::ReLUTransPrimitiveBench, mini_jit::benchmarks::ReciprocalPrimitiveBench, mini_jit::benchmarks::SigmoidInterpolationPrimitiveBench, mini_jit::benchmarks::SigmoidTaylorPrimitiveBench, mini_jit::benchmarks::SquarePrimitiveBench, mini_jit::benchmarks::SquareTransPrimitiveBench, mini_jit::benchmarks::TensorOperationBench, mini_jit::benchmarks::ZeroEorPrimitiveBench, mini_jit::benchmarks::ZeroXZRPrimitiveBench

Public Functions

inline virtual ~Benchmark()
virtual void run() = 0

Runs the benchmark.

inline benchmark_result getResult()

Returns the result of the benchmark.

struct benchmark_result

Public Members

long numReps = 0
double elapsedSeconds = 0.0f
long totalNumberElements = 0
long totalOperations = 0
double gflops = 0.0f
double totalDataProcessed = 0.0f
double gibps = 0.0f
class Binary

Public Types

using kernel_t = void (*)(void const *a, void const *b, void *c, int64_t ld_a, int64_t ld_b, int64_t ld_c)

Public Functions

inline ~Binary() noexcept

Destructor.

error_t generate(uint32_t m, uint32_t n, uint32_t trans_c, mini_jit::dtype_t dtype, mini_jit::ptype_t ptype)

Generate a kernel for a binary primitive.

Parameters:
  • m – Number of rows.

  • n – Number of columns.

  • trans_c – 0 if C is stored in column-major order, 1 if C is stored in row-major order.

  • dtype – Data type of the matrices.

  • ptype – Primitive type.

Returns:

error_t::success on success, another error_t value otherwise.

kernel_t get_kernel() const

Get the generated kernel: C := op(A, B).

Returns:

pointer to the generated kernel.

class Brgemm

Public Types

using kernel_t = void (*)(void const *a, void const *b, void *c, int64_t ld_a, int64_t ld_b, int64_t ld_c, int64_t br_stride_a, int64_t br_stride_b)

Public Functions

inline ~Brgemm() noexcept

Destructor.

error_t generate(uint32_t m, uint32_t n, uint32_t k, uint32_t br_size, uint32_t trans_a, uint32_t trans_b, uint32_t trans_c, mini_jit::dtype_t dtype)

Generate a kernel for batch-reduce matrix multiplication.

Parameters:
  • m – number of rows in A and C.

  • n – number of columns in B and C.

  • k – number of columns in A and rows in B.

  • br_size – batch-reduce size.

  • trans_a – 0 if A is stored in column-major order, 1 if A is stored in row-major order.

  • trans_b – 0 if B is stored in column-major order, 1 if B is stored in row-major order.

  • trans_c – 0 if C is stored in column-major order, 1 if C is stored in row-major order.

  • dtype – data type of the matrices.

Returns:

error_t::success on success, another error_t value otherwise.

kernel_t get_kernel() const

Get the generated kernel: C += sum_i(A_i * B_i).

Returns:

pointer to the generated kernel.

class Kernel

Public Functions

inline Kernel()

Constructor.

~Kernel() noexcept

Destructor.

Kernel(Kernel const&) = delete
Kernel &operator=(Kernel const&) = delete
Kernel(Kernel&&) noexcept = delete
Kernel &operator=(Kernel&&) noexcept = delete
void add_instr(uint32_t ins)

Adds an instruction to the code buffer.

Parameters:

ins – instruction which is added.

void add_instr(std::vector<uint32_t> ins)

Adds a vector of instructions to the code buffer.

Parameters:

ins – instructions which are added.

void add_label(std::string const &label)

Adds a label to the code buffer.

Parameters:

label – label which is added.

int getInstrCountFromLabel(std::string const &label) const

Returns how many instructions come after the given label.

Parameters:

label – label to search for.

Returns:

number of instructions after the label.

std::size_t get_size() const

Gets the size of the code buffer.

Returns:

size of the code buffer in bytes.

void set_kernel()

Sets the kernel based on the code buffer.

void const *get_kernel() const

Gets a pointer to the executable kernel.

void write(char const *filename) const

Writes the code buffer to the given file.

Parameters:

filename – name of the file to write to.

class TensorOperation

Public Functions

error_t setup(dtype_t dtype, ptype_t prim_first_touch, ptype_t prim_main, ptype_t prim_last_touch, std::span<const dim_t> dim_types, std::span<const exec_t> exec_types, std::span<const int64_t> dim_sizes, std::span<const int64_t> strides_in0, std::span<const int64_t> strides_in1, std::span<const int64_t> strides_out)

Setup for a binary tensor contraction or a unary tensor operation.

Parameters:
  • dtype – Data type of all tensor elements.

  • prim_first_touch – Type of the first touch primitive.

  • prim_main – Type of the main primitive.

  • prim_last_touch – Type of the last touch primitive.

  • dim_types – Dimension type of the loops (c, m, n, or k).

  • exec_types – Execution type of the loops (seq, shared, or prim).

  • dim_sizes – Sizes of the dimensions.

  • strides_in0 – Strides of the first input tensor.

  • strides_in1 – Strides of the second input tensor (ignored if unary).

  • strides_out – Strides of the output tensor.

Returns:

error_t::success on success, another error_t value otherwise.

void execute(void const *tensor_in0, void const *tensor_in1, void *tensor_out)

Execute the tensor operation.

Parameters:
  • tensor_in0 – First input tensor.

  • tensor_in1 – Second input tensor (use nullptr if unary).

  • tensor_out – Output tensor.

void execute_iter(int64_t id_loop, char const *ptr_in0, char const *ptr_in1, char *ptr_out, bool first_access, bool last_access)

General-purpose loop implementation featuring first and last touch operations. No threading is applied.

Parameters:
  • id_loop – Dimension id of the loop which is executed.

  • ptr_in0 – Pointer to the first input tensor’s data.

  • ptr_in1 – Pointer to the second input tensor’s data (use nullptr if unary).

  • ptr_out – Pointer to the output tensor’s data.

  • first_access – True if this is the first time the output tensor’s data is accessed.

  • last_access – True if this is the last time the output tensor’s data is accessed.

void execute_iter_parallel(char const *ptr_in0, char const *ptr_in1, char *ptr_out, bool first_access, bool last_access)

General-purpose loop implementation featuring first and last touch operations with parallelization.

Parameters:
  • ptr_in0 – Pointer to the first input tensor’s data.

  • ptr_in1 – Pointer to the second input tensor’s data (use nullptr if unary).

  • ptr_out – Pointer to the output tensor’s data.

  • first_access – True if this is the first time the output tensor’s data is accessed.

  • last_access – True if this is the last time the output tensor’s data is accessed.

inline int dtype_size() const
class Unary

Public Types

using kernel_t = void (*)(void const *a, void *b, int64_t ld_a, int64_t ld_b, void *extra)

Public Functions

inline ~Unary() noexcept

Destructor.

error_t generate(uint32_t m, uint32_t n, uint32_t trans_b, mini_jit::dtype_t dtype, mini_jit::ptype_t ptype)

Generate a kernel for a unary primitive.

Parameters:
  • m – Number of rows in A and B.

  • n – Number of columns in A and B.

  • trans_b – 0 if B is stored in column-major order, 1 if B is stored in row-major order.

  • dtype – Data type of the matrices.

  • ptype – Primitive type.

Returns:

error_t::success on success, another error_t value otherwise.

kernel_t get_kernel() const

Get the generated kernel: B := op(A).

Returns:

pointer to the generated kernel.

void set_extra(void *extra)

Set extra/context pointer for kernels that need it (e.g., lookup table).

void *get_extra() const

Get the extra/context pointer.