#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H

#define ROTATOR_RELOAD 512
#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
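/* The running phase is advanced multiplicatively (phase *= phase_inc), so
 * floating-point rounding slowly pulls its magnitude away from 1.0. Every
 * ROTATOR_RELOAD samples the kernels below renormalize the phase back to
 * unit magnitude; ROTATOR_RELOAD_2 and ROTATOR_RELOAD_4 express the same
 * interval in blocks of 2 and 4 complex samples for the SIMD variants.
 * A minimal sketch of the per-sample recurrence every variant implements:
 *
 *     *outVector++ = *inVector++ * (*phase);
 *     (*phase) *= phase_inc;
 *     // every ROTATOR_RELOAD samples: (*phase) /= |*phase|
 */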
                                                       unsigned int num_points)

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;

        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
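/* NEON variant: vld2q_f32/vst2q_f32 de-interleave four complex samples into a
 * float32x4x2_t (val[0] = real parts, val[1] = imaginary parts), so four
 * samples are rotated per iteration. Each lane carries its own copy of the
 * phase, offset by successive powers of phase_inc, so all four lane phases
 * advance by phase_inc^4 per vector iteration. */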
                                                       unsigned int num_points)

    const lv_32fc_t* inputVectorPtr = inVector;

    lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;

    for (i = 0; i < 4; ++i) {

    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {

            input_vec = vld2q_f32((float*)inputVectorPtr);

            vst2q_f32((float*)outputVectorPtr, output_vec);

            outputVectorPtr += 4;
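        /* Renormalize the running lane phases: multiply by the reciprocal of
         * their magnitude (inv_mag) so rounding error cannot accumulate in
         * the phase amplitude. */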
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);

        input_vec = vld2q_f32((float*)inputVectorPtr);

        vst2q_f32((float*)outputVectorPtr, output_vec);

        outputVectorPtr += 4;

    phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
    phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);

    vst2q_f32((float*)phasePtr, phase_vec);
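    /* Process the remaining (num_points % 4) samples scalar-wise, carrying
     * the phase forward in the first lane. */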
    for (i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (phase_inc);

    (*phase) = phasePtr[0];
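/* SSE4.1 variant (aligned): two complex samples per __m128 register, with two
 * staggered copies of the phase. The complex multiply presumably follows the
 * same duplicate/shuffle/addsub scheme spelled out in the AVX + FMA variants
 * further down, and the lane phases are renormalized once per ROTATOR_RELOAD
 * samples. */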
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,

                                                            unsigned int num_points)

    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {

    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);

    (*phase) = phase_Ptr[0];
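/* SSE4.1 variant (unaligned): same flow as the aligned version above, using
 * unaligned loads/stores for the input and output buffers. */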
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,

                                                            unsigned int num_points)

    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {

    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);

    (*phase) = phase_Ptr[0];
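/* AVX variants: four complex samples per __m256 register. The _a version uses
 * aligned data accesses (_mm256_load_ps/_mm256_store_ps); the _u version uses
 * _mm256_loadu_ps/_mm256_storeu_ps. The running phases are kept in phase_Val
 * and renormalized once per ROTATOR_RELOAD samples. */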
#include <immintrin.h>

                                                      unsigned int num_points)

    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {

            aVal = _mm256_load_ps((float*)aPtr);

            _mm256_store_ps((float*)cPtr, z);

        aVal = _mm256_load_ps((float*)aPtr);

        _mm256_store_ps((float*)cPtr, z);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
#include <immintrin.h>

                                                      unsigned int num_points)

    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            _mm256_storeu_ps((float*)cPtr, z);

        aVal = _mm256_loadu_ps((float*)aPtr);

        _mm256_storeu_ps((float*)cPtr, z);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
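/* AVX + FMA variants: same data layout as the AVX versions, but the complex
 * multiply uses _mm256_fmaddsub_ps to fuse the multiply with the add/subtract
 * of the cross terms, for both the samples and the phase update. */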
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,

                                                             unsigned int num_points)

    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
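            /* Complex multiply via duplicated components: yl/ylp hold the
             * real parts of the phase/increment duplicated into both slots,
             * yh/yhp the imaginary parts. Shuffling with 0xB1 swaps the real
             * and imaginary halves of each sample so the fmaddsub below can
             * combine the partial products into (ac - bd) in the real slot
             * and (ad + bc) in the imaginary slot, for both the output and
             * the phase update. */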
            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_store_ps((float*)cPtr, z);
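        /* Renormalize the four lane phases: square the components, hadd to
         * form |phase|^2, shuffle with 0xD8 so each squared magnitude lands
         * under both the real and imaginary slot of its sample, then divide
         * by the square root. */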
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);

        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);

    (*phase) = phase_Ptr[0];
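/* Unaligned AVX + FMA variant: identical flow to the aligned version above,
 * with _mm256_loadu_ps/_mm256_storeu_ps for the data and phase accesses. */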
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,

                                                             unsigned int num_points)

    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);

        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);

    (*phase) = phase_Ptr[0];