Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
volk_32f_x2_subtract_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 */
9
58#ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
59#define INCLUDED_volk_32f_x2_subtract_32f_a_H
60
61#include <inttypes.h>
62#include <stdio.h>
63
64#ifdef LV_HAVE_AVX512F
65#include <immintrin.h>
66
/*
 * Element-wise subtraction, AVX-512F aligned variant:
 *     cVector[i] = aVector[i] - bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer holding num_points floats
 * aVector    - minuend input, num_points floats
 * bVector    - subtrahend input, num_points floats
 * num_points - number of elements to process
 *
 * The vector loop uses _mm512_load_ps / _mm512_store_ps, which require the
 * addresses they touch to be 64-byte aligned, so all three buffers must be
 * 64-byte aligned.
 */
static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;

    /* Vector loop: one 512-bit register holds 16 floats. */
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal); /* store the results back into the C container */

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the remaining num_points % 16 elements. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
99#endif /* LV_HAVE_AVX512F */
100
101#ifdef LV_HAVE_AVX
102#include <immintrin.h>
103
/*
 * Element-wise subtraction, AVX aligned variant:
 *     cVector[i] = aVector[i] - bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer holding num_points floats
 * aVector    - minuend input, num_points floats
 * bVector    - subtrahend input, num_points floats
 * num_points - number of elements to process
 *
 * The vector loop uses _mm256_load_ps / _mm256_store_ps, which require the
 * addresses they touch to be 32-byte aligned, so all three buffers must be
 * 32-byte aligned.
 */
static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;

    /* Vector loop: one 256-bit register holds 8 floats. */
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal); /* store the results back into the C container */

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the remaining num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
136#endif /* LV_HAVE_AVX */
137
138#ifdef LV_HAVE_SSE
139#include <xmmintrin.h>
140
/*
 * Element-wise subtraction, SSE aligned variant:
 *     cVector[i] = aVector[i] - bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer holding num_points floats
 * aVector    - minuend input, num_points floats
 * bVector    - subtrahend input, num_points floats
 * num_points - number of elements to process
 *
 * The vector loop uses _mm_load_ps / _mm_store_ps, which require the
 * addresses they touch to be 16-byte aligned, so all three buffers must be
 * 16-byte aligned.
 */
static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;

    /* Vector loop: one 128-bit register holds 4 floats. */
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_sub_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal); /* store the results back into the C container */

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the remaining num_points % 4 elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
173#endif /* LV_HAVE_SSE */
174
175
176#ifdef LV_HAVE_GENERIC
177
/*
 * Element-wise subtraction, portable scalar variant:
 *     cVector[i] = aVector[i] - bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer holding num_points floats
 * aVector    - minuend input, num_points floats
 * bVector    - subtrahend input, num_points floats
 * num_points - number of elements to process; 0 leaves cVector untouched
 */
static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
192#endif /* LV_HAVE_GENERIC */
193
194
195#ifdef LV_HAVE_NEON
196#include <arm_neon.h>
197
/*
 * Element-wise subtraction, ARM NEON variant:
 *     cVector[i] = aVector[i] - bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer holding num_points floats
 * aVector    - minuend input, num_points floats
 * bVector    - subtrahend input, num_points floats
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;

    float32x4_t a_vec, b_vec, c_vec;

    /* Vector loop: a float32x4_t holds 4 floats. */
    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aPtr);
        b_vec = vld1q_f32(bPtr);
        c_vec = vsubq_f32(a_vec, b_vec);
        vst1q_f32(cPtr, c_vec);
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the remaining num_points % 4 elements. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
225#endif /* LV_HAVE_NEON */
226
227
228#ifdef LV_HAVE_ORC
/* ORC-generated implementation; defined outside this header. */
extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points);

/*
 * ORC variant of the subtraction kernel.
 *
 * Forwards all arguments unchanged to volk_32f_x2_subtract_32f_a_orc_impl.
 *
 * cVector    - output buffer holding num_points floats
 * aVector    - first input, num_points floats
 * bVector    - second input, num_points floats
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
241#endif /* LV_HAVE_ORC */
242
243
244#endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
245
246
247#ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
248#define INCLUDED_volk_32f_x2_subtract_32f_u_H
249
250#include <inttypes.h>
251#include <stdio.h>
252
253#ifdef LV_HAVE_AVX512F
254#include <immintrin.h>
255
/*
 * Element-wise subtraction, AVX-512F unaligned variant:
 *     cVector[i] = aVector[i] - bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer holding num_points floats
 * aVector    - minuend input, num_points floats
 * bVector    - subtrahend input, num_points floats
 * num_points - number of elements to process
 *
 * The vector loop uses _mm512_loadu_ps / _mm512_storeu_ps, so the buffers
 * need no particular alignment.
 */
static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;

    /* Vector loop: one 512-bit register holds 16 floats. */
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal); /* store the results back into the C container */

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the remaining num_points % 16 elements. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
288#endif /* LV_HAVE_AVX512F */
289
290
291#ifdef LV_HAVE_AVX
292#include <immintrin.h>
293
/*
 * Element-wise subtraction, AVX unaligned variant:
 *     cVector[i] = aVector[i] - bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer holding num_points floats
 * aVector    - minuend input, num_points floats
 * bVector    - subtrahend input, num_points floats
 * num_points - number of elements to process
 *
 * The vector loop uses _mm256_loadu_ps / _mm256_storeu_ps, so the buffers
 * need no particular alignment.
 */
static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;

    /* Vector loop: one 256-bit register holds 8 floats. */
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal); /* store the results back into the C container */

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the remaining num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
326#endif /* LV_HAVE_AVX */
327
328#endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */