clang 20.0.0git
fmaintrin.h
Go to the documentation of this file.
1/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __FMAINTRIN_H
15#define __FMAINTRIN_H
16
/* Attribute sets applied to every intrinsic defined in this header. Both
 * request always-inline, no debug info and the "fma" target feature; they
 * differ only in the minimum vector width they declare (128 vs. 256 bits). */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(256)))
20
21/// Computes a multiply-add of 128-bit vectors of [4 x float].
22/// For each element, computes <c> (__A * __B) + __C </c>.
23///
24/// \headerfile <immintrin.h>
25///
26/// This intrinsic corresponds to the \c VFMADD213PS instruction.
27///
28/// \param __A
29/// A 128-bit vector of [4 x float] containing the multiplicand.
30/// \param __B
31/// A 128-bit vector of [4 x float] containing the multiplier.
32/// \param __C
33/// A 128-bit vector of [4 x float] containing the addend.
34/// \returns A 128-bit vector of [4 x float] containing the result.
35static __inline__ __m128 __DEFAULT_FN_ATTRS128
36_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
37{
38 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
39}
40
41/// Computes a multiply-add of 128-bit vectors of [2 x double].
42/// For each element, computes <c> (__A * __B) + __C </c>.
43///
44/// \headerfile <immintrin.h>
45///
46/// This intrinsic corresponds to the \c VFMADD213PD instruction.
47///
48/// \param __A
49/// A 128-bit vector of [2 x double] containing the multiplicand.
50/// \param __B
51/// A 128-bit vector of [2 x double] containing the multiplier.
52/// \param __C
53/// A 128-bit vector of [2 x double] containing the addend.
54/// \returns A 128-bit vector of [2 x double] containing the result.
55static __inline__ __m128d __DEFAULT_FN_ATTRS128
56_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
57{
58 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
59}
60
61/// Computes a scalar multiply-add of the single-precision values in the
62/// low 32 bits of 128-bit vectors of [4 x float].
63///
64/// \code{.operation}
65/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
66/// result[127:32] = __A[127:32]
67/// \endcode
68///
69/// \headerfile <immintrin.h>
70///
71/// This intrinsic corresponds to the \c VFMADD213SS instruction.
72///
73/// \param __A
74/// A 128-bit vector of [4 x float] containing the multiplicand in the low
75/// 32 bits.
76/// \param __B
77/// A 128-bit vector of [4 x float] containing the multiplier in the low
78/// 32 bits.
79/// \param __C
80/// A 128-bit vector of [4 x float] containing the addend in the low
81/// 32 bits.
82/// \returns A 128-bit vector of [4 x float] containing the result in the low
83/// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
84static __inline__ __m128 __DEFAULT_FN_ATTRS128
85_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
86{
87 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
88}
89
90/// Computes a scalar multiply-add of the double-precision values in the
91/// low 64 bits of 128-bit vectors of [2 x double].
92///
93/// \code{.operation}
94/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
95/// result[127:64] = __A[127:64]
96/// \endcode
97///
98/// \headerfile <immintrin.h>
99///
100/// This intrinsic corresponds to the \c VFMADD213SD instruction.
101///
102/// \param __A
103/// A 128-bit vector of [2 x double] containing the multiplicand in the low
104/// 64 bits.
105/// \param __B
106/// A 128-bit vector of [2 x double] containing the multiplier in the low
107/// 64 bits.
108/// \param __C
109/// A 128-bit vector of [2 x double] containing the addend in the low
110/// 64 bits.
111/// \returns A 128-bit vector of [2 x double] containing the result in the low
112/// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
113static __inline__ __m128d __DEFAULT_FN_ATTRS128
114_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
115{
116 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
117}
118
119/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
120/// For each element, computes <c> (__A * __B) - __C </c>.
121///
122/// \headerfile <immintrin.h>
123///
124/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
125///
126/// \param __A
127/// A 128-bit vector of [4 x float] containing the multiplicand.
128/// \param __B
129/// A 128-bit vector of [4 x float] containing the multiplier.
130/// \param __C
131/// A 128-bit vector of [4 x float] containing the subtrahend.
132/// \returns A 128-bit vector of [4 x float] containing the result.
133static __inline__ __m128 __DEFAULT_FN_ATTRS128
134_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
135{
136 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
137}
138
139/// Computes a multiply-subtract of 128-bit vectors of [2 x double].
140/// For each element, computes <c> (__A * __B) - __C </c>.
141///
142/// \headerfile <immintrin.h>
143///
144/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
145///
146/// \param __A
147/// A 128-bit vector of [2 x double] containing the multiplicand.
148/// \param __B
149/// A 128-bit vector of [2 x double] containing the multiplier.
150/// \param __C
151/// A 128-bit vector of [2 x double] containing the subtrahend.
152/// \returns A 128-bit vector of [2 x double] containing the result.
153static __inline__ __m128d __DEFAULT_FN_ATTRS128
154_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
155{
156 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
157}
158
159/// Computes a scalar multiply-subtract of the single-precision values in
160/// the low 32 bits of 128-bit vectors of [4 x float].
161///
162/// \code{.operation}
163/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
164/// result[127:32] = __A[127:32]
165/// \endcode
166///
167/// \headerfile <immintrin.h>
168///
169/// This intrinsic corresponds to the \c VFMSUB213SS instruction.
170///
171/// \param __A
172/// A 128-bit vector of [4 x float] containing the multiplicand in the low
173/// 32 bits.
174/// \param __B
175/// A 128-bit vector of [4 x float] containing the multiplier in the low
176/// 32 bits.
177/// \param __C
178/// A 128-bit vector of [4 x float] containing the subtrahend in the low
179/// 32 bits.
180/// \returns A 128-bit vector of [4 x float] containing the result in the low
181/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
182static __inline__ __m128 __DEFAULT_FN_ATTRS128
183_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
184{
185 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
186}
187
188/// Computes a scalar multiply-subtract of the double-precision values in
189/// the low 64 bits of 128-bit vectors of [2 x double].
190///
191/// \code{.operation}
192/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
193/// result[127:64] = __A[127:64]
194/// \endcode
195///
196/// \headerfile <immintrin.h>
197///
198/// This intrinsic corresponds to the \c VFMSUB213SD instruction.
199///
200/// \param __A
201/// A 128-bit vector of [2 x double] containing the multiplicand in the low
202/// 64 bits.
203/// \param __B
204/// A 128-bit vector of [2 x double] containing the multiplier in the low
205/// 64 bits.
206/// \param __C
207/// A 128-bit vector of [2 x double] containing the subtrahend in the low
208/// 64 bits.
209/// \returns A 128-bit vector of [2 x double] containing the result in the low
210/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
211static __inline__ __m128d __DEFAULT_FN_ATTRS128
212_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
213{
214 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
215}
216
217/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
218/// For each element, computes <c> -(__A * __B) + __C </c>.
219///
220/// \headerfile <immintrin.h>
221///
222/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
223///
224/// \param __A
225/// A 128-bit vector of [4 x float] containing the multiplicand.
226/// \param __B
227/// A 128-bit vector of [4 x float] containing the multiplier.
228/// \param __C
229/// A 128-bit vector of [4 x float] containing the addend.
230/// \returns A 128-bit vector of [4 x float] containing the result.
231static __inline__ __m128 __DEFAULT_FN_ATTRS128
232_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
233{
234 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
235}
236
237/// Computes a negated multiply-add of 128-bit vectors of [2 x double].
238/// For each element, computes <c> -(__A * __B) + __C </c>.
239///
240/// \headerfile <immintrin.h>
241///
242/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
243///
244/// \param __A
245/// A 128-bit vector of [2 x double] containing the multiplicand.
246/// \param __B
247/// A 128-bit vector of [2 x double] containing the multiplier.
248/// \param __C
249/// A 128-bit vector of [2 x double] containing the addend.
250/// \returns A 128-bit vector of [2 x double] containing the result.
251static __inline__ __m128d __DEFAULT_FN_ATTRS128
252_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
253{
254 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
255}
256
257/// Computes a scalar negated multiply-add of the single-precision values in
258/// the low 32 bits of 128-bit vectors of [4 x float].
259///
260/// \code{.operation}
261/// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
262/// result[127:32] = __A[127:32]
263/// \endcode
264///
265/// \headerfile <immintrin.h>
266///
267/// This intrinsic corresponds to the \c VFNMADD213SS instruction.
268///
269/// \param __A
270/// A 128-bit vector of [4 x float] containing the multiplicand in the low
271/// 32 bits.
272/// \param __B
273/// A 128-bit vector of [4 x float] containing the multiplier in the low
274/// 32 bits.
275/// \param __C
276/// A 128-bit vector of [4 x float] containing the addend in the low
277/// 32 bits.
278/// \returns A 128-bit vector of [4 x float] containing the result in the low
279/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
280static __inline__ __m128 __DEFAULT_FN_ATTRS128
281_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
282{
283 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
284}
285
286/// Computes a scalar negated multiply-add of the double-precision values
287/// in the low 64 bits of 128-bit vectors of [2 x double].
288///
289/// \code{.operation}
290/// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
291/// result[127:64] = __A[127:64]
292/// \endcode
293///
294/// \headerfile <immintrin.h>
295///
296/// This intrinsic corresponds to the \c VFNMADD213SD instruction.
297///
298/// \param __A
299/// A 128-bit vector of [2 x double] containing the multiplicand in the low
300/// 64 bits.
301/// \param __B
302/// A 128-bit vector of [2 x double] containing the multiplier in the low
303/// 64 bits.
304/// \param __C
305/// A 128-bit vector of [2 x double] containing the addend in the low
306/// 64 bits.
307/// \returns A 128-bit vector of [2 x double] containing the result in the low
308/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
309static __inline__ __m128d __DEFAULT_FN_ATTRS128
310_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
311{
312 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
313}
314
315/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
316/// For each element, computes <c> -(__A * __B) - __C </c>.
317///
318/// \headerfile <immintrin.h>
319///
320/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
321///
322/// \param __A
323/// A 128-bit vector of [4 x float] containing the multiplicand.
324/// \param __B
325/// A 128-bit vector of [4 x float] containing the multiplier.
326/// \param __C
327/// A 128-bit vector of [4 x float] containing the subtrahend.
328/// \returns A 128-bit vector of [4 x float] containing the result.
329static __inline__ __m128 __DEFAULT_FN_ATTRS128
330_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
331{
332 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
333}
334
335/// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
336/// For each element, computes <c> -(__A * __B) - __C </c>.
337///
338/// \headerfile <immintrin.h>
339///
340/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
341///
342/// \param __A
343/// A 128-bit vector of [2 x double] containing the multiplicand.
344/// \param __B
345/// A 128-bit vector of [2 x double] containing the multiplier.
346/// \param __C
347/// A 128-bit vector of [2 x double] containing the subtrahend.
348/// \returns A 128-bit vector of [2 x double] containing the result.
349static __inline__ __m128d __DEFAULT_FN_ATTRS128
350_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
351{
352 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
353}
354
355/// Computes a scalar negated multiply-subtract of the single-precision
356/// values in the low 32 bits of 128-bit vectors of [4 x float].
357///
358/// \code{.operation}
359/// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
360/// result[127:32] = __A[127:32]
361/// \endcode
362///
363/// \headerfile <immintrin.h>
364///
365/// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
366///
367/// \param __A
368/// A 128-bit vector of [4 x float] containing the multiplicand in the low
369/// 32 bits.
370/// \param __B
371/// A 128-bit vector of [4 x float] containing the multiplier in the low
372/// 32 bits.
373/// \param __C
374/// A 128-bit vector of [4 x float] containing the subtrahend in the low
375/// 32 bits.
376/// \returns A 128-bit vector of [4 x float] containing the result in the low
377/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
378static __inline__ __m128 __DEFAULT_FN_ATTRS128
379_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
380{
381 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
382}
383
384/// Computes a scalar negated multiply-subtract of the double-precision
385/// values in the low 64 bits of 128-bit vectors of [2 x double].
386///
387/// \code{.operation}
388/// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
389/// result[127:64] = __A[127:64]
390/// \endcode
391///
392/// \headerfile <immintrin.h>
393///
394/// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
395///
396/// \param __A
397/// A 128-bit vector of [2 x double] containing the multiplicand in the low
398/// 64 bits.
399/// \param __B
400/// A 128-bit vector of [2 x double] containing the multiplier in the low
401/// 64 bits.
402/// \param __C
403/// A 128-bit vector of [2 x double] containing the subtrahend in the low
404/// 64 bits.
405/// \returns A 128-bit vector of [2 x double] containing the result in the low
406/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
407static __inline__ __m128d __DEFAULT_FN_ATTRS128
408_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
409{
410 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
411}
412
413/// Computes a multiply with alternating add/subtract of 128-bit vectors of
414/// [4 x float].
415///
416/// \code{.operation}
417/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
418/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
419/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
420/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
421/// \endcode
422///
423/// \headerfile <immintrin.h>
424///
425/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
426///
427/// \param __A
428/// A 128-bit vector of [4 x float] containing the multiplicand.
429/// \param __B
430/// A 128-bit vector of [4 x float] containing the multiplier.
431/// \param __C
432/// A 128-bit vector of [4 x float] containing the addend/subtrahend.
433/// \returns A 128-bit vector of [4 x float] containing the result.
434static __inline__ __m128 __DEFAULT_FN_ATTRS128
435_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
436{
437 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
438}
439
440/// Computes a multiply with alternating add/subtract of 128-bit vectors of
441/// [2 x double].
442///
443/// \code{.operation}
444/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
445/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
446/// \endcode
447///
448/// \headerfile <immintrin.h>
449///
450/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
451///
452/// \param __A
453/// A 128-bit vector of [2 x double] containing the multiplicand.
454/// \param __B
455/// A 128-bit vector of [2 x double] containing the multiplier.
456/// \param __C
457/// A 128-bit vector of [2 x double] containing the addend/subtrahend.
458/// \returns A 128-bit vector of [2 x double] containing the result.
459static __inline__ __m128d __DEFAULT_FN_ATTRS128
460_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
461{
462 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
463}
464
465/// Computes a multiply with alternating add/subtract of 128-bit vectors of
466/// [4 x float].
467///
468/// \code{.operation}
469/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
470/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
471/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
472/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
473/// \endcode
474///
475/// \headerfile <immintrin.h>
476///
477/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
478///
479/// \param __A
480/// A 128-bit vector of [4 x float] containing the multiplicand.
481/// \param __B
482/// A 128-bit vector of [4 x float] containing the multiplier.
483/// \param __C
484/// A 128-bit vector of [4 x float] containing the addend/subtrahend.
485/// \returns A 128-bit vector of [4 x float] containing the result.
486static __inline__ __m128 __DEFAULT_FN_ATTRS128
487_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
488{
489 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
490}
491
492/// Computes a multiply with alternating add/subtract of 128-bit vectors of
493/// [2 x double].
494///
495/// \code{.operation}
496/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
497/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
498/// \endcode
499///
500/// \headerfile <immintrin.h>
501///
502/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
503///
504/// \param __A
505/// A 128-bit vector of [2 x double] containing the multiplicand.
506/// \param __B
507/// A 128-bit vector of [2 x double] containing the multiplier.
508/// \param __C
509/// A 128-bit vector of [2 x double] containing the addend/subtrahend.
510/// \returns A 128-bit vector of [2 x double] containing the result.
511static __inline__ __m128d __DEFAULT_FN_ATTRS128
512_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
513{
514 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
515}
516
517/// Computes a multiply-add of 256-bit vectors of [8 x float].
518/// For each element, computes <c> (__A * __B) + __C </c>.
519///
520/// \headerfile <immintrin.h>
521///
522/// This intrinsic corresponds to the \c VFMADD213PS instruction.
523///
524/// \param __A
525/// A 256-bit vector of [8 x float] containing the multiplicand.
526/// \param __B
527/// A 256-bit vector of [8 x float] containing the multiplier.
528/// \param __C
529/// A 256-bit vector of [8 x float] containing the addend.
530/// \returns A 256-bit vector of [8 x float] containing the result.
531static __inline__ __m256 __DEFAULT_FN_ATTRS256
532_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
533{
534 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
535}
536
537/// Computes a multiply-add of 256-bit vectors of [4 x double].
538/// For each element, computes <c> (__A * __B) + __C </c>.
539///
540/// \headerfile <immintrin.h>
541///
542/// This intrinsic corresponds to the \c VFMADD213PD instruction.
543///
544/// \param __A
545/// A 256-bit vector of [4 x double] containing the multiplicand.
546/// \param __B
547/// A 256-bit vector of [4 x double] containing the multiplier.
548/// \param __C
549/// A 256-bit vector of [4 x double] containing the addend.
550/// \returns A 256-bit vector of [4 x double] containing the result.
551static __inline__ __m256d __DEFAULT_FN_ATTRS256
552_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
553{
554 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
555}
556
557/// Computes a multiply-subtract of 256-bit vectors of [8 x float].
558/// For each element, computes <c> (__A * __B) - __C </c>.
559///
560/// \headerfile <immintrin.h>
561///
562/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
563///
564/// \param __A
565/// A 256-bit vector of [8 x float] containing the multiplicand.
566/// \param __B
567/// A 256-bit vector of [8 x float] containing the multiplier.
568/// \param __C
569/// A 256-bit vector of [8 x float] containing the subtrahend.
570/// \returns A 256-bit vector of [8 x float] containing the result.
571static __inline__ __m256 __DEFAULT_FN_ATTRS256
572_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
573{
574 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
575}
576
577/// Computes a multiply-subtract of 256-bit vectors of [4 x double].
578/// For each element, computes <c> (__A * __B) - __C </c>.
579///
580/// \headerfile <immintrin.h>
581///
582/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
583///
584/// \param __A
585/// A 256-bit vector of [4 x double] containing the multiplicand.
586/// \param __B
587/// A 256-bit vector of [4 x double] containing the multiplier.
588/// \param __C
589/// A 256-bit vector of [4 x double] containing the subtrahend.
590/// \returns A 256-bit vector of [4 x double] containing the result.
591static __inline__ __m256d __DEFAULT_FN_ATTRS256
592_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
593{
594 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
595}
596
597/// Computes a negated multiply-add of 256-bit vectors of [8 x float].
598/// For each element, computes <c> -(__A * __B) + __C </c>.
599///
600/// \headerfile <immintrin.h>
601///
602/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
603///
604/// \param __A
605/// A 256-bit vector of [8 x float] containing the multiplicand.
606/// \param __B
607/// A 256-bit vector of [8 x float] containing the multiplier.
608/// \param __C
609/// A 256-bit vector of [8 x float] containing the addend.
610/// \returns A 256-bit vector of [8 x float] containing the result.
611static __inline__ __m256 __DEFAULT_FN_ATTRS256
612_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
613{
614 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
615}
616
617/// Computes a negated multiply-add of 256-bit vectors of [4 x double].
618/// For each element, computes <c> -(__A * __B) + __C </c>.
619///
620/// \headerfile <immintrin.h>
621///
622/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
623///
624/// \param __A
625/// A 256-bit vector of [4 x double] containing the multiplicand.
626/// \param __B
627/// A 256-bit vector of [4 x double] containing the multiplier.
628/// \param __C
629/// A 256-bit vector of [4 x double] containing the addend.
630/// \returns A 256-bit vector of [4 x double] containing the result.
631static __inline__ __m256d __DEFAULT_FN_ATTRS256
632_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
633{
634 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
635}
636
637/// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
638/// For each element, computes <c> -(__A * __B) - __C </c>.
639///
640/// \headerfile <immintrin.h>
641///
642/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
643///
644/// \param __A
645/// A 256-bit vector of [8 x float] containing the multiplicand.
646/// \param __B
647/// A 256-bit vector of [8 x float] containing the multiplier.
648/// \param __C
649/// A 256-bit vector of [8 x float] containing the subtrahend.
650/// \returns A 256-bit vector of [8 x float] containing the result.
651static __inline__ __m256 __DEFAULT_FN_ATTRS256
652_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
653{
654 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
655}
656
657/// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
658/// For each element, computes <c> -(__A * __B) - __C </c>.
659///
660/// \headerfile <immintrin.h>
661///
662/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
663///
664/// \param __A
665/// A 256-bit vector of [4 x double] containing the multiplicand.
666/// \param __B
667/// A 256-bit vector of [4 x double] containing the multiplier.
668/// \param __C
669/// A 256-bit vector of [4 x double] containing the subtrahend.
670/// \returns A 256-bit vector of [4 x double] containing the result.
671static __inline__ __m256d __DEFAULT_FN_ATTRS256
672_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
673{
674 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
675}
676
677/// Computes a multiply with alternating add/subtract of 256-bit vectors of
678/// [8 x float].
679///
680/// \code{.operation}
681/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
682/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
683/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
684/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
685/// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
686/// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
687/// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
688/// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
689/// \endcode
690///
691/// \headerfile <immintrin.h>
692///
693/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
694///
695/// \param __A
696/// A 256-bit vector of [8 x float] containing the multiplicand.
697/// \param __B
698/// A 256-bit vector of [8 x float] containing the multiplier.
699/// \param __C
700/// A 256-bit vector of [8 x float] containing the addend/subtrahend.
701/// \returns A 256-bit vector of [8 x float] containing the result.
702static __inline__ __m256 __DEFAULT_FN_ATTRS256
703_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
704{
705 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
706}
707
708/// Computes a multiply with alternating add/subtract of 256-bit vectors of
709/// [4 x double].
710///
711/// \code{.operation}
712/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
713/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
714/// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
715/// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
716/// \endcode
717///
718/// \headerfile <immintrin.h>
719///
720/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
721///
722/// \param __A
723/// A 256-bit vector of [4 x double] containing the multiplicand.
724/// \param __B
725/// A 256-bit vector of [4 x double] containing the multiplier.
726/// \param __C
727/// A 256-bit vector of [4 x double] containing the addend/subtrahend.
728/// \returns A 256-bit vector of [4 x double] containing the result.
729static __inline__ __m256d __DEFAULT_FN_ATTRS256
730_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
731{
732 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
733}
734
735/// Computes a vector multiply with alternating add/subtract of 256-bit
736/// vectors of [8 x float].
737///
738/// \code{.operation}
739/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
740/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
741/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
742/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
743/// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
744/// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
745/// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
746/// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
747/// \endcode
748///
749/// \headerfile <immintrin.h>
750///
751/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
752///
753/// \param __A
754/// A 256-bit vector of [8 x float] containing the multiplicand.
755/// \param __B
756/// A 256-bit vector of [8 x float] containing the multiplier.
757/// \param __C
758/// A 256-bit vector of [8 x float] containing the addend/subtrahend.
759/// \returns A 256-bit vector of [8 x float] containing the result.
760static __inline__ __m256 __DEFAULT_FN_ATTRS256
761_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
762{
763 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
764}
765
766/// Computes a vector multiply with alternating add/subtract of 256-bit
767/// vectors of [4 x double].
768///
769/// \code{.operation}
770/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
771/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
772/// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
773/// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
774/// \endcode
775///
776/// \headerfile <immintrin.h>
777///
778/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
779///
780/// \param __A
781/// A 256-bit vector of [4 x double] containing the multiplicand.
782/// \param __B
783/// A 256-bit vector of [4 x double] containing the multiplier.
784/// \param __C
785/// A 256-bit vector of [4 x double] containing the addend/subtrahend.
786/// \returns A 256-bit vector of [4 x double] containing the result.
787static __inline__ __m256d __DEFAULT_FN_ATTRS256
788_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
789{
790 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
791}
792
793#undef __DEFAULT_FN_ATTRS128
794#undef __DEFAULT_FN_ATTRS256
795
796#endif /* __FMAINTRIN_H */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a multiply-subtract of 128-bit vectors of [4 x float].
Definition: fmaintrin.h:134
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a vector multiply with alternating add/subtract of 256-bit vectors of [8 x float].
Definition: fmaintrin.h:761
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
Definition: fmaintrin.h:350
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a multiply with alternating add/subtract of 256-bit vectors of [8 x float].
Definition: fmaintrin.h:703
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a multiply with alternating add/subtract of 128-bit vectors of [4 x float].
Definition: fmaintrin.h:487
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a multiply with alternating add/subtract of 128-bit vectors of [4 x float].
Definition: fmaintrin.h:435
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a multiply with alternating add/subtract of 256-bit vectors of [4 x double].
Definition: fmaintrin.h:730
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a multiply with alternating add/subtract of 128-bit vectors of [2 x double].
Definition: fmaintrin.h:512
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
Definition: fmaintrin.h:652
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar multiply-subtract of the single-precision values in the low 32 bits of 128-bit vectors of [4 x float].
Definition: fmaintrin.h:183
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar multiply-subtract of the double-precision values in the low 64 bits of 128-bit vectors of [2 x double].
Definition: fmaintrin.h:212
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a multiply-add of 128-bit vectors of [2 x double].
Definition: fmaintrin.h:56
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
Definition: fmaintrin.h:672
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a negated multiply-add of 256-bit vectors of [4 x double].
Definition: fmaintrin.h:632
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar negated multiply-subtract of the single-precision values in the low 32 bits of 128-bit vectors of [4 x float].
Definition: fmaintrin.h:379
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar multiply-add of the double-precision values in the low 64 bits of 128-bit vectors of [2 x double].
Definition: fmaintrin.h:114
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a multiply with alternating add/subtract of 128-bit vectors of [2 x double].
Definition: fmaintrin.h:460
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a multiply-add of 128-bit vectors of [4 x float].
Definition: fmaintrin.h:36
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a multiply-subtract of 256-bit vectors of [8 x float].
Definition: fmaintrin.h:572
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a negated multiply-add of 128-bit vectors of [4 x float].
Definition: fmaintrin.h:232
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a negated multiply-add of 128-bit vectors of [2 x double].
Definition: fmaintrin.h:252
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a multiply-add of 256-bit vectors of [8 x float].
Definition: fmaintrin.h:532
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a vector multiply with alternating add/subtract of 256-bit vectors of [4 x double].
Definition: fmaintrin.h:788
#define __DEFAULT_FN_ATTRS256
Definition: fmaintrin.h:19
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a multiply-subtract of 128-bit vectors of [2 x double].
Definition: fmaintrin.h:154
#define __DEFAULT_FN_ATTRS128
Definition: fmaintrin.h:18
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar negated multiply-add of the double-precision values in the low 64 bits of 128-bit vectors of [2 x double].
Definition: fmaintrin.h:310
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar negated multiply-subtract of the double-precision values in the low 64 bits of 128-bit vectors of [2 x double].
Definition: fmaintrin.h:408
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a negated multiply-add of 256-bit vectors of [8 x float].
Definition: fmaintrin.h:612
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar negated multiply-add of the single-precision values in the low 32 bits of 128-bit vectors of [4 x float].
Definition: fmaintrin.h:281
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
Definition: fmaintrin.h:330
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar multiply-add of the single-precision values in the low 32 bits of 128-bit vectors of [4 x float].
Definition: fmaintrin.h:85
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a multiply-subtract of 256-bit vectors of [4 x double].
Definition: fmaintrin.h:592
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a multiply-add of 256-bit vectors of [4 x double].
Definition: fmaintrin.h:552