11#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
14#ifndef __AMX_TRANSPOSEINTRIN_H
15#define __AMX_TRANSPOSEINTRIN_H
// Attribute set placed on the functions defined below in this header:
// force inlining, suppress debug info, and compile the function body for the
// "amx-transpose" target feature.
#define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
  __attribute__((__always_inline__))                                           \
  __attribute__((__nodebug__))                                                 \
  __attribute__((__target__("amx-transpose")))
// Expands to a call of __builtin_ia32_t2rpntlvwz0, forwarding the three
// arguments in order.
// NOTE(review): tdst is presumably a compile-time tile register index and
// base/stride describe the source memory -- confirm against the builtin.
#define _tile_2rpntlvwz0(tdst, base, stride)                                   \
  __builtin_ia32_t2rpntlvwz0((tdst), (base), (stride))
// Expands to a call of __builtin_ia32_t2rpntlvwz0t1, forwarding the three
// arguments in order.
// NOTE(review): tdst is presumably a compile-time tile register index and
// base/stride describe the source memory -- confirm against the builtin.
#define _tile_2rpntlvwz0t1(tdst, base, stride)                                 \
  __builtin_ia32_t2rpntlvwz0t1((tdst), (base), (stride))
// Expands to a call of __builtin_ia32_t2rpntlvwz1, forwarding the three
// arguments in order.
// NOTE(review): tdst is presumably a compile-time tile register index and
// base/stride describe the source memory -- confirm against the builtin.
#define _tile_2rpntlvwz1(tdst, base, stride)                                   \
  __builtin_ia32_t2rpntlvwz1((tdst), (base), (stride))
// Expands to a call of __builtin_ia32_t2rpntlvwz1t1, forwarding the three
// arguments in order.
// NOTE(review): tdst is presumably a compile-time tile register index and
// base/stride describe the source memory -- confirm against the builtin.
#define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
  __builtin_ia32_t2rpntlvwz1t1((tdst), (base), (stride))
// Expands to a call of __builtin_ia32_ttransposed, forwarding dst and src in
// order.
// NOTE(review): dst/src are presumably compile-time tile register indices --
// confirm against the builtin.
#define _tile_transposed(dst, src)                                             \
  __builtin_ia32_ttransposed((dst), (src))
60static __inline__
void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
61 unsigned short row,
unsigned short col0,
unsigned short col1,
62 _tile1024i *dst0, _tile1024i *dst1,
const void *base,
63 __SIZE_TYPE__ stride) {
66 __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
67 (_tile1024i_1024a *)dst1, base,
68 (__SIZE_TYPE__)(stride));
71static __inline__
void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
72 unsigned short row,
unsigned short col0,
unsigned short col1,
73 _tile1024i *dst0, _tile1024i *dst1,
const void *base,
74 __SIZE_TYPE__ stride) {
75 __builtin_ia32_t2rpntlvwz0t1_internal(
76 row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
77 (__SIZE_TYPE__)(stride));
80static __inline__
void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
81 unsigned short row,
unsigned short col0,
unsigned short col1,
82 _tile1024i *dst0, _tile1024i *dst1,
const void *base,
83 __SIZE_TYPE__ stride) {
84 __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
85 (_tile1024i_1024a *)dst1, base,
86 (__SIZE_TYPE__)(stride));
89static __inline__
void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
90 unsigned short row,
unsigned short col0,
unsigned short col1,
91 _tile1024i *dst0, _tile1024i *dst1,
const void *base,
92 __SIZE_TYPE__ stride) {
93 __builtin_ia32_t2rpntlvwz1t1_internal(
94 row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
95 (__SIZE_TYPE__)(stride));
99static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
100_tile_transposed_internal(
unsigned short m,
unsigned short n, _tile1024i src) {
101 return __builtin_ia32_ttransposed_internal(m, n, src);
129__DEFAULT_FN_ATTRS_TRANSPOSE
130static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
131 const void *base, __SIZE_TYPE__ stride) {
132 _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
133 &dst1->tile, base, stride);
159__DEFAULT_FN_ATTRS_TRANSPOSE
160static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
161 const void *base, __SIZE_TYPE__ stride) {
162 _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
163 &dst1->tile, base, stride);
192__DEFAULT_FN_ATTRS_TRANSPOSE
193static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
194 const void *base, __SIZE_TYPE__ stride) {
195 _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
196 &dst1->tile, base, stride);
225__DEFAULT_FN_ATTRS_TRANSPOSE
226static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
227 const void *base, __SIZE_TYPE__ stride) {
228 _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
229 &dst1->tile, base, stride);
242__DEFAULT_FN_ATTRS_TRANSPOSE
243static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
244 dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);