Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_int32.neonintrinsic.c
1 /*
2  * Copyright 2013-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : dsp/NE10_fft_int32.neonintrinsic.c
30  */
31 
32 #include <arm_neon.h>
33 
34 #include "NE10_types.h"
35 #include "NE10_macros.h"
36 #include "NE10_fft.h"
37 #include "NE10_dsp.h"
38 
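/*
 * Scalar 4-point butterflies. FFT4_FS forms the radix-4 input sums and
 * differences; FFT4_FS_SCALED additionally shifts each term right by 2 bits
 * (a divide by 4) so the fixed-point outputs cannot overflow. FFT4_FWD_LS and
 * FFT4_INV_LS differ only in the sign of the +/-j rotation applied when
 * combining the odd outputs.
 */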
39 #define FFT4_FS_START \
40  ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
41  ne10_int32_t tmp_r, tmp_i;
42 
43 
44 #define FFT4_FS \
45  s2_r = Fin[0].r - Fin[2].r; \
46  s2_i = Fin[0].i - Fin[2].i; \
47  tmp_r = Fin[0].r + Fin[2].r; \
48  tmp_i = Fin[0].i + Fin[2].i; \
49  s0_r = Fin[1].r + Fin[3].r; \
50  s0_i = Fin[1].i + Fin[3].i; \
51  s1_r = Fin[1].r - Fin[3].r; \
52  s1_i = Fin[1].i - Fin[3].i;
53 
54 #define FFT4_FS_SCALED \
55  s2_r = (Fin[0].r - Fin[2].r) >> 2; \
56  s2_i = (Fin[0].i - Fin[2].i) >> 2; \
57  tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
58  tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
59  s0_r = (Fin[1].r + Fin[3].r) >> 2; \
60  s0_i = (Fin[1].i + Fin[3].i) >> 2; \
61  s1_r = (Fin[1].r - Fin[3].r) >> 2; \
62  s1_i = (Fin[1].i - Fin[3].i) >> 2;
63 
64 #define FFT4_FWD_LS \
65  Fout[2].r = tmp_r - s0_r; \
66  Fout[2].i = tmp_i - s0_i; \
67  Fout[0].r = tmp_r + s0_r; \
68  Fout[0].i = tmp_i + s0_i; \
69  Fout[1].r = s2_r + s1_i; \
70  Fout[1].i = s2_i - s1_r; \
71  Fout[3].r = s2_r - s1_i; \
72  Fout[3].i = s2_i + s1_r;
73 
74 #define FFT4_INV_LS \
75  Fout[2].r = tmp_r - s0_r; \
76  Fout[2].i = tmp_i - s0_i; \
77  Fout[0].r = tmp_r + s0_r; \
78  Fout[0].i = tmp_i + s0_i; \
79  Fout[1].r = s2_r - s1_i; \
80  Fout[1].i = s2_i + s1_r; \
81  Fout[3].r = s2_r + s1_i; \
82  Fout[3].i = s2_i - s1_r;
83 
84 static inline void ne10_fft4_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
85  ne10_fft_cpx_int32_t * Fin)
86 
87 {
88  FFT4_FS_START
89  FFT4_FS
90  FFT4_FWD_LS
91 }
92 
93 static inline void ne10_fft4_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
94  ne10_fft_cpx_int32_t * Fin)
95 
96 {
97  FFT4_FS_START
98  FFT4_FS
99  FFT4_INV_LS
100 }
101 static inline void ne10_fft4_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
102  ne10_fft_cpx_int32_t * Fin)
103 
104 {
105  FFT4_FS_START
106  FFT4_FS_SCALED
107  FFT4_FWD_LS
108 }
109 
110 static inline void ne10_fft4_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
111  ne10_fft_cpx_int32_t * Fin)
112 
113 {
114  FFT4_FS_START
115  FFT4_FS_SCALED
116  FFT4_INV_LS
117 }
118 
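/*
 * Scalar 8-point butterflies. TW_81 below is sqrt(2)/2 in Q31 format
 * (0.70710678 * 2^31 ~ 1518500249); it is applied via a 64-bit product
 * shifted right by 31 to stay in Q31. FFT8_FS_SCALED shifts the first-stage
 * terms right by 3 bits (a divide by 8) to guard against overflow.
 */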
119 #define FFT8_FS_START \
120  ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
121  ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
122  const ne10_int32_t TW_81 = 1518500249;
123 
124 #define FFT8_FS \
125  s0_r = Fin[0].r + Fin[4].r; \
126  s0_i = Fin[0].i + Fin[4].i; \
127  s1_r = Fin[0].r - Fin[4].r; \
128  s1_i = Fin[0].i - Fin[4].i; \
129  s2_r = Fin[1].r + Fin[5].r; \
130  s2_i = Fin[1].i + Fin[5].i; \
131  s3_r = Fin[1].r - Fin[5].r; \
132  s3_i = Fin[1].i - Fin[5].i; \
133  s4_r = Fin[2].r + Fin[6].r; \
134  s4_i = Fin[2].i + Fin[6].i; \
135  s5_r = Fin[2].r - Fin[6].r; \
136  s5_i = Fin[2].i - Fin[6].i; \
137  s6_r = Fin[3].r + Fin[7].r; \
138  s6_i = Fin[3].i + Fin[7].i; \
139  s7_r = Fin[3].r - Fin[7].r; \
140  s7_i = Fin[3].i - Fin[7].i;
141 
142 #define FFT8_FS_SCALED \
143  s0_r = (Fin[0].r + Fin[4].r) >> 3; \
144  s0_i = (Fin[0].i + Fin[4].i) >> 3; \
145  s1_r = (Fin[0].r - Fin[4].r) >> 3; \
146  s1_i = (Fin[0].i - Fin[4].i) >> 3; \
147  s2_r = (Fin[1].r + Fin[5].r) >> 3; \
148  s2_i = (Fin[1].i + Fin[5].i) >> 3; \
149  s3_r = (Fin[1].r - Fin[5].r) >> 3; \
150  s3_i = (Fin[1].i - Fin[5].i) >> 3; \
151  s4_r = (Fin[2].r + Fin[6].r) >> 3; \
152  s4_i = (Fin[2].i + Fin[6].i) >> 3; \
153  s5_r = (Fin[2].r - Fin[6].r) >> 3; \
154  s5_i = (Fin[2].i - Fin[6].i) >> 3; \
155  s6_r = (Fin[3].r + Fin[7].r) >> 3; \
156  s6_i = (Fin[3].i + Fin[7].i) >> 3; \
157  s7_r = (Fin[3].r - Fin[7].r) >> 3; \
158  s7_i = (Fin[3].i - Fin[7].i) >> 3;
159 
160 
161 #define FFT8_FWD_LS \
162  t0_r = s0_r - s4_r; \
163  t0_i = s0_i - s4_i; \
164  t1_r = s0_r + s4_r; \
165  t1_i = s0_i + s4_i; \
166  t2_r = s2_r + s6_r; \
167  t2_i = s2_i + s6_i; \
168  t3_r = s2_r - s6_r; \
169  t3_i = s2_i - s6_i; \
170  Fout[0].r = t1_r + t2_r; \
171  Fout[0].i = t1_i + t2_i; \
172  Fout[4].r = t1_r - t2_r; \
173  Fout[4].i = t1_i - t2_i; \
174  Fout[2].r = t0_r + t3_i; \
175  Fout[2].i = t0_i - t3_r; \
176  Fout[6].r = t0_r - t3_i; \
177  Fout[6].i = t0_i + t3_r; \
178  t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31); \
179  t4_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31); \
180  t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31); \
181  t5_i = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31); \
182  t0_r = s1_r - s5_i; \
183  t0_i = s1_i + s5_r; \
184  t1_r = s1_r + s5_i; \
185  t1_i = s1_i - s5_r; \
186  t2_r = t4_r - t5_r; \
187  t2_i = t4_i - t5_i; \
188  t3_r = t4_r + t5_r; \
189  t3_i = t4_i + t5_i; \
190  Fout[1].r = t1_r + t2_r; \
191  Fout[1].i = t1_i + t2_i; \
192  Fout[5].r = t1_r - t2_r; \
193  Fout[5].i = t1_i - t2_i; \
194  Fout[3].r = t0_r + t3_i; \
195  Fout[3].i = t0_i - t3_r; \
196  Fout[7].r = t0_r - t3_i; \
197  Fout[7].i = t0_i + t3_r;
198 
199 #define FFT8_INV_LS \
200  t0_r = s0_r - s4_r; \
201  t0_i = s0_i - s4_i; \
202  t1_r = s0_r + s4_r; \
203  t1_i = s0_i + s4_i; \
204  t2_r = s2_r + s6_r; \
205  t2_i = s2_i + s6_i; \
206  t3_r = s2_r - s6_r; \
207  t3_i = s2_i - s6_i; \
208  Fout[0].r = t1_r + t2_r; \
209  Fout[0].i = t1_i + t2_i; \
210  Fout[4].r = t1_r - t2_r; \
211  Fout[4].i = t1_i - t2_i; \
212  Fout[2].r = t0_r - t3_i; \
213  Fout[2].i = t0_i + t3_r; \
214  Fout[6].r = t0_r + t3_i; \
215  Fout[6].i = t0_i - t3_r; \
216  t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31); \
217  t4_i = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31); \
218  t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31); \
219  t5_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31); \
220  t0_r = s1_r + s5_i; \
221  t0_i = s1_i - s5_r; \
222  t1_r = s1_r - s5_i; \
223  t1_i = s1_i + s5_r; \
224  t2_r = t4_r - t5_r; \
225  t2_i = t4_i - t5_i; \
226  t3_r = t4_r + t5_r; \
227  t3_i = t4_i + t5_i; \
228  Fout[1].r = t1_r + t2_r; \
229  Fout[1].i = t1_i + t2_i; \
230  Fout[5].r = t1_r - t2_r; \
231  Fout[5].i = t1_i - t2_i; \
232  Fout[3].r = t0_r - t3_i; \
233  Fout[3].i = t0_i + t3_r; \
234  Fout[7].r = t0_r + t3_i; \
235  Fout[7].i = t0_i - t3_r;
236 
237 static inline void ne10_fft8_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
238  ne10_fft_cpx_int32_t * Fin)
239 
240 {
241  FFT8_FS_START
242  FFT8_FS
243  FFT8_FWD_LS
244 }
245 
246 static inline void ne10_fft8_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
247  ne10_fft_cpx_int32_t * Fin)
248 
249 {
250  FFT8_FS_START
251  FFT8_FS
252  FFT8_INV_LS
253 }
254 static inline void ne10_fft8_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
255  ne10_fft_cpx_int32_t * Fin)
256 
257 {
258  FFT8_FS_START
259  FFT8_FS_SCALED
260  FFT8_FWD_LS
261 }
262 
263 static inline void ne10_fft8_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
264  ne10_fft_cpx_int32_t * Fin)
265 
266 {
267  FFT8_FS_START
268  FFT8_FS_SCALED
269  FFT8_INV_LS
270 }
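/*
 * NEON 16-point kernels. vld2q_s32 de-interleaves four complex values into
 * separate real/imaginary vectors, so each macro advances four butterflies at
 * once. Twiddle products use vqrdmulhq_s32 (saturating rounding doubling
 * multiply, high half), i.e. a rounded Q31 multiply. The _SCALED variants
 * swap vaddq/vsubq for the halving forms vhaddq/vhsubq, dividing by 2 at
 * every butterfly stage.
 */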
271 #define FFT16_FS_START \
272  ne10_fft_cpx_int32_t *tw1, *tw2, *tw3; \
273  int32_t *p_src0, *p_src4, *p_src8, *p_src12; \
274  int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef; \
275  int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i; \
276  int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d; \
277  int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
278 
279 #define FFT16_LS_START \
280  int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3; \
281  int32_t *p_tw1, *p_tw2, *p_tw3; \
282  int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i; \
283  int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i; \
284  int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3; \
285  int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef; \
286  int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef; \
287  int32x4x2_t q2_tw1, q2_tw2, q2_tw3; \
288  int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5; \
289  int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
290 
291 #define FFT16_FS \
292  p_src0 = (int32_t*) (& (Fin[0])); \
293  p_src4 = (int32_t*) (& (Fin[4])); \
294  p_src8 = (int32_t*) (& (Fin[8])); \
295  p_src12 = (int32_t*) (& (Fin[12])); \
296  q2_in_0123 = vld2q_s32 (p_src0); \
297  q2_in_4567 = vld2q_s32 (p_src4); \
298  q2_in_89ab = vld2q_s32 (p_src8); \
299  q2_in_cdef = vld2q_s32 (p_src12); \
300  q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
301  q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
302  q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
303  q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
304  q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
305  q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
306  q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
307  q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
308  q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r); \
309  q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i); \
310  q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r); \
311  q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);
312 
313 #define FFT16_FS_SCALED \
314  p_src0 = (int32_t*) (& (Fin[0])); \
315  p_src4 = (int32_t*) (& (Fin[4])); \
316  p_src8 = (int32_t*) (& (Fin[8])); \
317  p_src12 = (int32_t*) (& (Fin[12])); \
318  q2_in_0123 = vld2q_s32 (p_src0); \
319  q2_in_4567 = vld2q_s32 (p_src4); \
320  q2_in_89ab = vld2q_s32 (p_src8); \
321  q2_in_cdef = vld2q_s32 (p_src12); \
322  q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
323  q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
324  q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
325  q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
326  q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
327  q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
328  q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
329  q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
330  q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r); \
331  q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i); \
332  q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r); \
333  q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);
334 
335 #define FFT16_LS_LOAD \
336  tw1 = twiddles; \
337  tw2 = twiddles + 4; \
338  tw3 = twiddles + 8; \
339  p_dst0 = (int32_t*) (&Fout[0]); \
340  p_dst1 = (int32_t*) (&Fout[4]); \
341  p_dst2 = (int32_t*) (&Fout[8]); \
342  p_dst3 = (int32_t*) (&Fout[12]); \
343  p_tw1 = (int32_t*) tw1; \
344  p_tw2 = (int32_t*) tw2; \
345  p_tw3 = (int32_t*) tw3; \
346  q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d); \
347  q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d); \
348  q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf); \
349  q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf); \
350  q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0])); \
351  q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0])); \
352  q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0])); \
353  q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0])); \
354  q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1])); \
355  q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1])); \
356  q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1])); \
357  q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1])); \
358  q2_tw1 = vld2q_s32 (p_tw1); \
359  q2_tw2 = vld2q_s32 (p_tw2); \
360  q2_tw3 = vld2q_s32 (p_tw3);
361 
362 #define FFT16_FWD_LS \
363  q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]); \
364  q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]); \
365  q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]); \
366  q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]); \
367  q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]); \
368  q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]); \
369  q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]); \
370  q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]); \
371  q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]); \
372  q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]); \
373  q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]); \
374  q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
375 
376 #define FFT16_INV_LS \
377  q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]); \
378  q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]); \
379  q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]); \
380  q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]); \
381  q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]); \
382  q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]); \
383  q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]); \
384  q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]); \
385  q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]); \
386  q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]); \
387  q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]); \
388  q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
389 
390 #define FFT16_FWD_LS_S0 \
391  q_s0_r = vsubq_s32 (q_s0_r, q_tmp0); \
392  q_s0_i = vaddq_s32 (q_s0_i, q_tmp1); \
393  q_s1_r = vsubq_s32 (q_s1_r, q_tmp2); \
394  q_s1_i = vaddq_s32 (q_s1_i, q_tmp3); \
395  q_s2_r = vsubq_s32 (q_s2_r, q_tmp4); \
396  q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);
397 
398 #define FFT16_INV_LS_S0 \
399  q_s0_r = vaddq_s32 (q_s0_r, q_tmp0); \
400  q_s0_i = vsubq_s32 (q_s0_i, q_tmp1); \
401  q_s1_r = vaddq_s32 (q_s1_r, q_tmp2); \
402  q_s1_i = vsubq_s32 (q_s1_i, q_tmp3); \
403  q_s2_r = vaddq_s32 (q_s2_r, q_tmp4); \
404  q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);
405 
406 #define FFT16_LS_02 \
407  q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r); \
408  q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i); \
409  q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r); \
410  q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i); \
411  q_s3_r = vaddq_s32 (q_s0_r, q_s2_r); \
412  q_s3_i = vaddq_s32 (q_s0_i, q_s2_i); \
413  q_s4_r = vsubq_s32 (q_s0_r, q_s2_r); \
414  q_s4_i = vsubq_s32 (q_s0_i, q_s2_i); \
415  q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r); \
416  q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i); \
417  q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r); \
418  q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);
419 
420 
421 #define FFT16_LS_02_SCALED \
422  q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r); \
423  q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i); \
424  q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r); \
425  q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i); \
426  q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r); \
427  q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i); \
428  q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r); \
429  q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i); \
430  q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r); \
431  q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i); \
432  q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r); \
433  q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);
434 
435 #define FFT16_ST \
436  vst2q_s32 (p_dst0, q2_out_0123); \
437  vst2q_s32 (p_dst1, q2_out_4567); \
438  vst2q_s32 (p_dst2, q2_out_89ab); \
439  vst2q_s32 (p_dst3, q2_out_cdef);
440 
441 static void ne10_fft16_forward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
442  ne10_fft_cpx_int32_t * Fin,
443  ne10_fft_cpx_int32_t * twiddles)
444 {
445  // the first stage
446  FFT16_FS_START
447  FFT16_FS
448  q_out_r159d = vaddq_s32 (q_t2_r, q_t1_i);
449  q_out_i159d = vsubq_s32 (q_t2_i, q_t1_r);
450  q_out_r37bf = vsubq_s32 (q_t2_r, q_t1_i);
451  q_out_i37bf = vaddq_s32 (q_t2_i, q_t1_r);
452 
453  // the second stage
454  FFT16_LS_START
455  FFT16_LS_LOAD
456  FFT16_FWD_LS
457  FFT16_FWD_LS_S0
458  FFT16_LS_02
459 
460  q2_out_4567.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
461  q2_out_4567.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
462  q2_out_cdef.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
463  q2_out_cdef.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
464 
465  FFT16_ST
466 }
467 
468 static void ne10_fft16_backward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
469  ne10_fft_cpx_int32_t * Fin,
470  ne10_fft_cpx_int32_t * twiddles)
471 {
472  // the first stage
473  FFT16_FS_START
474  FFT16_FS
475  q_out_r159d = vsubq_s32 (q_t2_r, q_t1_i);
476  q_out_i159d = vaddq_s32 (q_t2_i, q_t1_r);
477  q_out_r37bf = vaddq_s32 (q_t2_r, q_t1_i);
478  q_out_i37bf = vsubq_s32 (q_t2_i, q_t1_r);
479 
480  // the second stage
481  FFT16_LS_START
482  FFT16_LS_LOAD
483  FFT16_INV_LS
484  FFT16_INV_LS_S0
485  FFT16_LS_02
486 
487  q2_out_4567.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
488  q2_out_4567.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
489  q2_out_cdef.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
490  q2_out_cdef.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
491 
492  FFT16_ST
493 }
494 
495 static void ne10_fft16_forward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
496  ne10_fft_cpx_int32_t * Fin,
497  ne10_fft_cpx_int32_t * twiddles)
498 {
499  // the first stage
500  FFT16_FS_START
501  FFT16_FS_SCALED
502  q_out_r159d = vhaddq_s32 (q_t2_r, q_t1_i);
503  q_out_i159d = vhsubq_s32 (q_t2_i, q_t1_r);
504  q_out_r37bf = vhsubq_s32 (q_t2_r, q_t1_i);
505  q_out_i37bf = vhaddq_s32 (q_t2_i, q_t1_r);
506 
507  // the second stage
508  FFT16_LS_START
509  FFT16_LS_LOAD
510  FFT16_FWD_LS
511  FFT16_FWD_LS_S0
512  FFT16_LS_02_SCALED
513 
514  q2_out_4567.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
515  q2_out_4567.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
516  q2_out_cdef.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
517  q2_out_cdef.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
518 
519  FFT16_ST
520 }
521 
522 static void ne10_fft16_backward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
523  ne10_fft_cpx_int32_t * Fin,
524  ne10_fft_cpx_int32_t * twiddles)
525 {
526  // the first stage
527  FFT16_FS_START
528  FFT16_FS_SCALED
529  q_out_r159d = vhsubq_s32 (q_t2_r, q_t1_i);
530  q_out_i159d = vhaddq_s32 (q_t2_i, q_t1_r);
531  q_out_r37bf = vhaddq_s32 (q_t2_r, q_t1_i);
532  q_out_i37bf = vhsubq_s32 (q_t2_i, q_t1_r);
533 
534  // the second stage
535  FFT16_LS_START
536  FFT16_LS_LOAD
537  FFT16_INV_LS
538  FFT16_INV_LS_S0
539  FFT16_LS_02_SCALED
540 
541  q2_out_4567.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
542  q2_out_4567.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
543  q2_out_cdef.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
544  q2_out_cdef.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
545 
546  FFT16_ST
547 }
548 
549 
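/*
 * First-stage radix-8 kernel, used when the leading factor pair reduces to an
 * 8-point stage. Each iteration loads eight groups of four complex inputs
 * (one group every 'stride' elements), runs four 8-point butterflies in
 * parallel across the vector lanes, then transposes the results with
 * vtrnq_s32/vcombine_s32 so they can be stored contiguously. TW_81/TW_81N are
 * +/- sqrt(2)/2 in Q31.
 */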
550 #define RADIX8x4_START \
551  ne10_int32_t f_count; \
552  ne10_int32_t src_step = stride << 1; \
553  const ne10_int32_t TW_81 = 1518500249; \
554  const ne10_int32_t TW_81N = -1518500249; \
555  int32_t *p_src, *p_dst; \
556  int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7; \
557  int32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i; \
558  int32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i; \
559  int32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i; \
560  int32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i; \
561  int32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i; \
562  int32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i; \
563  int32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i; \
564  int32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
565  int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7; \
566  int32x4_t q_tw_81, q_tw_81n; \
567  p_src = (int32_t *) Fin; \
568  p_dst = (int32_t *) Fout;
569 
570 
571 #define RADIX8x4_LOAD \
572  q2_in0 = vld2q_s32 (p_src); \
573  p_src += src_step; \
574  q2_in2 = vld2q_s32 (p_src); \
575  p_src += src_step; \
576  q2_in4 = vld2q_s32 (p_src); \
577  p_src += src_step; \
578  q2_in6 = vld2q_s32 (p_src); \
579  p_src += src_step; \
580  q2_in1 = vld2q_s32 (p_src); \
581  p_src += src_step; \
582  q2_in3 = vld2q_s32 (p_src); \
583  p_src += src_step; \
584  q2_in5 = vld2q_s32 (p_src); \
585  p_src += src_step; \
586  q2_in7 = vld2q_s32 (p_src); \
587  p_src += src_step;
588 
589 #define RADIX8x4_STORE \
590  q2_tmp0 = vtrnq_s32 (q_out0_r, q_out1_r); \
591  q2_tmp1 = vtrnq_s32 (q_out0_i, q_out1_i); \
592  q2_tmp2 = vtrnq_s32 (q_out2_r, q_out3_r); \
593  q2_tmp3 = vtrnq_s32 (q_out2_i, q_out3_i); \
594  q2_tmp4 = vtrnq_s32 (q_out4_r, q_out5_r); \
595  q2_tmp5 = vtrnq_s32 (q_out4_i, q_out5_i); \
596  q2_tmp6 = vtrnq_s32 (q_out6_r, q_out7_r); \
597  q2_tmp7 = vtrnq_s32 (q_out6_i, q_out7_i); \
598  q2_out0.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[0]), vget_low_s32 (q2_tmp2.val[0])); \
599  q2_out0.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[0]), vget_low_s32 (q2_tmp3.val[0])); \
600  q2_out2.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[1]), vget_low_s32 (q2_tmp2.val[1])); \
601  q2_out2.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[1]), vget_low_s32 (q2_tmp3.val[1])); \
602  q2_out4.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[0]), vget_high_s32 (q2_tmp2.val[0])); \
603  q2_out4.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[0]), vget_high_s32 (q2_tmp3.val[0])); \
604  q2_out6.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[1]), vget_high_s32 (q2_tmp2.val[1])); \
605  q2_out6.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[1]), vget_high_s32 (q2_tmp3.val[1])); \
606  q2_out1.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp4.val[0]), vget_low_s32 (q2_tmp6.val[0])); \
607  q2_out1.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp5.val[0]), vget_low_s32 (q2_tmp7.val[0])); \
608  q2_out3.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp4.val[1]), vget_low_s32 (q2_tmp6.val[1])); \
609  q2_out3.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp5.val[1]), vget_low_s32 (q2_tmp7.val[1])); \
610  q2_out5.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp4.val[0]), vget_high_s32 (q2_tmp6.val[0])); \
611  q2_out5.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp5.val[0]), vget_high_s32 (q2_tmp7.val[0])); \
612  q2_out7.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp4.val[1]), vget_high_s32 (q2_tmp6.val[1])); \
613  q2_out7.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp5.val[1]), vget_high_s32 (q2_tmp7.val[1])); \
614  vst2q_s32 (p_dst, q2_out0); \
615  p_dst += 8; \
616  vst2q_s32 (p_dst, q2_out1); \
617  p_dst += 8; \
618  vst2q_s32 (p_dst, q2_out2); \
619  p_dst += 8; \
620  vst2q_s32 (p_dst, q2_out3); \
621  p_dst += 8; \
622  vst2q_s32 (p_dst, q2_out4); \
623  p_dst += 8; \
624  vst2q_s32 (p_dst, q2_out5); \
625  p_dst += 8; \
626  vst2q_s32 (p_dst, q2_out6); \
627  p_dst += 8; \
628  vst2q_s32 (p_dst, q2_out7); \
629  p_dst += 8; \
630  p_src = p_src - src_step * 8 + 8;
631 
632 #define RADIX8x4_FS_S0 \
633  q_sin0_r = vaddq_s32 (q2_in0.val[0], q2_in1.val[0]); \
634  q_sin0_i = vaddq_s32 (q2_in0.val[1], q2_in1.val[1]); \
635  q_sin1_r = vsubq_s32 (q2_in0.val[0], q2_in1.val[0]); \
636  q_sin1_i = vsubq_s32 (q2_in0.val[1], q2_in1.val[1]); \
637  q_sin2_r = vaddq_s32 (q2_in2.val[0], q2_in3.val[0]); \
638  q_sin2_i = vaddq_s32 (q2_in2.val[1], q2_in3.val[1]); \
639  q_sin3_r = vsubq_s32 (q2_in2.val[0], q2_in3.val[0]); \
640  q_sin3_i = vsubq_s32 (q2_in2.val[1], q2_in3.val[1]); \
641  q_sin4_r = vaddq_s32 (q2_in4.val[0], q2_in5.val[0]); \
642  q_sin4_i = vaddq_s32 (q2_in4.val[1], q2_in5.val[1]); \
643  q_sin5_r = vsubq_s32 (q2_in4.val[0], q2_in5.val[0]); \
644  q_sin5_i = vsubq_s32 (q2_in4.val[1], q2_in5.val[1]); \
645  q_sin6_r = vaddq_s32 (q2_in6.val[0], q2_in7.val[0]); \
646  q_sin6_i = vaddq_s32 (q2_in6.val[1], q2_in7.val[1]); \
647  q_sin7_r = vsubq_s32 (q2_in6.val[0], q2_in7.val[0]); \
648  q_sin7_i = vsubq_s32 (q2_in6.val[1], q2_in7.val[1]);
649 
650 #define RADIX8x4_FWD_S357 \
651  q_tw_81 = vdupq_n_s32 (TW_81); \
652  q_tw_81n = vdupq_n_s32 (TW_81N); \
653  q_s5_r = q_sin5_i; \
654  q_s5_i = vnegq_s32 (q_sin5_r); \
655  q_s3_r = vaddq_s32 (q_sin3_r, q_sin3_i); \
656  q_s3_i = vsubq_s32 (q_sin3_i, q_sin3_r); \
657  q_s7_r = vsubq_s32 (q_sin7_r, q_sin7_i); \
658  q_s7_i = vaddq_s32 (q_sin7_i, q_sin7_r); \
659  q_s3_r = vqdmulhq_s32 (q_s3_r, q_tw_81); \
660  q_s3_i = vqdmulhq_s32 (q_s3_i, q_tw_81); \
661  q_s7_r = vqdmulhq_s32 (q_s7_r, q_tw_81n); \
662  q_s7_i = vqdmulhq_s32 (q_s7_i, q_tw_81n);
663 
664 #define RADIX8x4_INV_S357 \
665  q_tw_81 = vdupq_n_s32 (TW_81); \
666  q_tw_81n = vdupq_n_s32 (TW_81N); \
667  q_s5_r = vnegq_s32 (q_sin5_i); \
668  q_s5_i = q_sin5_r; \
669  q_s3_r = vsubq_s32 (q_sin3_r, q_sin3_i); \
670  q_s3_i = vaddq_s32 (q_sin3_i, q_sin3_r); \
671  q_s7_r = vaddq_s32 (q_sin7_r, q_sin7_i); \
672  q_s7_i = vsubq_s32 (q_sin7_i, q_sin7_r); \
673  q_s3_r = vqdmulhq_s32 (q_s3_r, q_tw_81); \
674  q_s3_i = vqdmulhq_s32 (q_s3_i, q_tw_81); \
675  q_s7_r = vqdmulhq_s32 (q_s7_r, q_tw_81n); \
676  q_s7_i = vqdmulhq_s32 (q_s7_i, q_tw_81n);
677 
678 #define RADIX8x4_LS_02 \
679  q_s8_r = vaddq_s32 (q_sin0_r, q_sin4_r); \
680  q_s8_i = vaddq_s32 (q_sin0_i, q_sin4_i); \
681  q_s9_r = vaddq_s32 (q_sin1_r, q_s5_r); \
682  q_s9_i = vaddq_s32 (q_sin1_i, q_s5_i); \
683  q_s10_r = vsubq_s32 (q_sin0_r, q_sin4_r); \
684  q_s10_i = vsubq_s32 (q_sin0_i, q_sin4_i); \
685  q_s11_r = vsubq_s32 (q_sin1_r, q_s5_r); \
686  q_s11_i = vsubq_s32 (q_sin1_i, q_s5_i); \
687  q_s12_r = vaddq_s32 (q_sin2_r, q_sin6_r); \
688  q_s12_i = vaddq_s32 (q_sin2_i, q_sin6_i); \
689  q_s13_r = vaddq_s32 (q_s3_r, q_s7_r); \
690  q_s13_i = vaddq_s32 (q_s3_i, q_s7_i); \
691  q_s14_r = vsubq_s32 (q_sin2_r, q_sin6_r); \
692  q_s14_i = vsubq_s32 (q_sin2_i, q_sin6_i); \
693  q_s15_r = vsubq_s32 (q_s3_r, q_s7_r); \
694  q_s15_i = vsubq_s32 (q_s3_i, q_s7_i); \
695  q_out4_r = vsubq_s32 (q_s8_r, q_s12_r); \
696  q_out4_i = vsubq_s32 (q_s8_i, q_s12_i); \
697  q_out5_r = vsubq_s32 (q_s9_r, q_s13_r); \
698  q_out5_i = vsubq_s32 (q_s9_i, q_s13_i); \
699  q_out0_r = vaddq_s32 (q_s8_r, q_s12_r); \
700  q_out0_i = vaddq_s32 (q_s8_i, q_s12_i); \
701  q_out1_r = vaddq_s32 (q_s9_r, q_s13_r); \
702  q_out1_i = vaddq_s32 (q_s9_i, q_s13_i);
703 
704 #define RADIX8x4_FS_S0_SCALED \
705  q_sin0_r = vhaddq_s32 (q2_in0.val[0], q2_in1.val[0]); \
706  q_sin0_i = vhaddq_s32 (q2_in0.val[1], q2_in1.val[1]); \
707  q_sin1_r = vhsubq_s32 (q2_in0.val[0], q2_in1.val[0]); \
708  q_sin1_i = vhsubq_s32 (q2_in0.val[1], q2_in1.val[1]); \
709  q_sin2_r = vhaddq_s32 (q2_in2.val[0], q2_in3.val[0]); \
710  q_sin2_i = vhaddq_s32 (q2_in2.val[1], q2_in3.val[1]); \
711  q_sin3_r = vhsubq_s32 (q2_in2.val[0], q2_in3.val[0]); \
712  q_sin3_i = vhsubq_s32 (q2_in2.val[1], q2_in3.val[1]); \
713  q_sin4_r = vhaddq_s32 (q2_in4.val[0], q2_in5.val[0]); \
714  q_sin4_i = vhaddq_s32 (q2_in4.val[1], q2_in5.val[1]); \
715  q_sin5_r = vhsubq_s32 (q2_in4.val[0], q2_in5.val[0]); \
716  q_sin5_i = vhsubq_s32 (q2_in4.val[1], q2_in5.val[1]); \
717  q_sin6_r = vhaddq_s32 (q2_in6.val[0], q2_in7.val[0]); \
718  q_sin6_i = vhaddq_s32 (q2_in6.val[1], q2_in7.val[1]); \
719  q_sin7_r = vhsubq_s32 (q2_in6.val[0], q2_in7.val[0]); \
720  q_sin7_i = vhsubq_s32 (q2_in6.val[1], q2_in7.val[1]);
721 
722 #define RADIX8x4_LS_02_SCALED \
723  q_s8_r = vhaddq_s32 (q_sin0_r, q_sin4_r); \
724  q_s8_i = vhaddq_s32 (q_sin0_i, q_sin4_i); \
725  q_s9_r = vhaddq_s32 (q_sin1_r, q_s5_r); \
726  q_s9_i = vhaddq_s32 (q_sin1_i, q_s5_i); \
727  q_s10_r = vhsubq_s32 (q_sin0_r, q_sin4_r); \
728  q_s10_i = vhsubq_s32 (q_sin0_i, q_sin4_i); \
729  q_s11_r = vhsubq_s32 (q_sin1_r, q_s5_r); \
730  q_s11_i = vhsubq_s32 (q_sin1_i, q_s5_i); \
731  q_s12_r = vhaddq_s32 (q_sin2_r, q_sin6_r); \
732  q_s12_i = vhaddq_s32 (q_sin2_i, q_sin6_i); \
733  q_s13_r = vhaddq_s32 (q_s3_r, q_s7_r); \
734  q_s13_i = vhaddq_s32 (q_s3_i, q_s7_i); \
735  q_s14_r = vhsubq_s32 (q_sin2_r, q_sin6_r); \
736  q_s14_i = vhsubq_s32 (q_sin2_i, q_sin6_i); \
737  q_s15_r = vhsubq_s32 (q_s3_r, q_s7_r); \
738  q_s15_i = vhsubq_s32 (q_s3_i, q_s7_i); \
739  q_out4_r = vhsubq_s32 (q_s8_r, q_s12_r); \
740  q_out4_i = vhsubq_s32 (q_s8_i, q_s12_i); \
741  q_out5_r = vhsubq_s32 (q_s9_r, q_s13_r); \
742  q_out5_i = vhsubq_s32 (q_s9_i, q_s13_i); \
743  q_out0_r = vhaddq_s32 (q_s8_r, q_s12_r); \
744  q_out0_i = vhaddq_s32 (q_s8_i, q_s12_i); \
745  q_out1_r = vhaddq_s32 (q_s9_r, q_s13_r); \
746  q_out1_i = vhaddq_s32 (q_s9_i, q_s13_i);
747 
748 
749 static inline void ne10_radix8x4_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
750  ne10_fft_cpx_int32_t * Fin,
751  ne10_int32_t stride)
752 {
753  RADIX8x4_START
754 
755  for (f_count = 0; f_count < stride; f_count += 4)
756  {
757  RADIX8x4_LOAD
758  RADIX8x4_FS_S0
759 
760 
761  // radix 4 butterfly without twiddles
762  RADIX8x4_FWD_S357
763  RADIX8x4_LS_02
764 
765  q_out2_r = vaddq_s32 (q_s10_r, q_s14_i);
766  q_out2_i = vsubq_s32 (q_s10_i, q_s14_r);
767  q_out3_r = vaddq_s32 (q_s11_r, q_s15_i);
768  q_out3_i = vsubq_s32 (q_s11_i, q_s15_r);
769  q_out6_r = vsubq_s32 (q_s10_r, q_s14_i);
770  q_out6_i = vaddq_s32 (q_s10_i, q_s14_r);
771  q_out7_r = vsubq_s32 (q_s11_r, q_s15_i);
772  q_out7_i = vaddq_s32 (q_s11_i, q_s15_r);
773 
774  RADIX8x4_STORE
775  } // f_count
776 }
777 
778 static inline void ne10_radix8x4_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
779  ne10_fft_cpx_int32_t * Fin,
780  ne10_int32_t stride)
781 {
782  RADIX8x4_START
783 
784  for (f_count = 0; f_count < stride; f_count += 4)
785  {
786  RADIX8x4_LOAD
787  RADIX8x4_FS_S0
788 
789  // radix 4 butterfly without twiddles
790  RADIX8x4_INV_S357
791  RADIX8x4_LS_02
792 
793  q_out2_r = vsubq_s32 (q_s10_r, q_s14_i);
794  q_out2_i = vaddq_s32 (q_s10_i, q_s14_r);
795  q_out3_r = vsubq_s32 (q_s11_r, q_s15_i);
796  q_out3_i = vaddq_s32 (q_s11_i, q_s15_r);
797  q_out6_r = vaddq_s32 (q_s10_r, q_s14_i);
798  q_out6_i = vsubq_s32 (q_s10_i, q_s14_r);
799  q_out7_r = vaddq_s32 (q_s11_r, q_s15_i);
800  q_out7_i = vsubq_s32 (q_s11_i, q_s15_r);
801 
802  RADIX8x4_STORE
803  } // f_count
804 }
805 static inline void ne10_radix8x4_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
806  ne10_fft_cpx_int32_t * Fin,
807  ne10_int32_t stride)
808 {
809  RADIX8x4_START
810 
811  for (f_count = 0; f_count < stride; f_count += 4)
812  {
813  RADIX8x4_LOAD
814  RADIX8x4_FS_S0_SCALED
815 
816  // radix 4 butterfly without twiddles
817  RADIX8x4_FWD_S357
818  RADIX8x4_LS_02_SCALED
819 
820  q_out2_r = vhaddq_s32 (q_s10_r, q_s14_i);
821  q_out2_i = vhsubq_s32 (q_s10_i, q_s14_r);
822  q_out3_r = vhaddq_s32 (q_s11_r, q_s15_i);
823  q_out3_i = vhsubq_s32 (q_s11_i, q_s15_r);
824  q_out6_r = vhsubq_s32 (q_s10_r, q_s14_i);
825  q_out6_i = vhaddq_s32 (q_s10_i, q_s14_r);
826  q_out7_r = vhsubq_s32 (q_s11_r, q_s15_i);
827  q_out7_i = vhaddq_s32 (q_s11_i, q_s15_r);
828 
829  RADIX8x4_STORE
830  } // f_count
831 }
832 
833 static inline void ne10_radix8x4_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
834  ne10_fft_cpx_int32_t * Fin,
835  ne10_int32_t stride)
836 {
837  RADIX8x4_START
838 
839  for (f_count = 0; f_count < stride; f_count += 4)
840  {
841  RADIX8x4_LOAD
842  RADIX8x4_FS_S0_SCALED
843 
844  // radix 4 butterfly without twiddles
845  RADIX8x4_INV_S357
846  RADIX8x4_LS_02_SCALED
847 
848  q_out2_r = vhsubq_s32 (q_s10_r, q_s14_i);
849  q_out2_i = vhaddq_s32 (q_s10_i, q_s14_r);
850  q_out3_r = vhsubq_s32 (q_s11_r, q_s15_i);
851  q_out3_i = vhaddq_s32 (q_s11_i, q_s15_r);
852  q_out6_r = vhaddq_s32 (q_s10_r, q_s14_i);
853  q_out6_i = vhsubq_s32 (q_s10_i, q_s14_r);
854  q_out7_r = vhaddq_s32 (q_s11_r, q_s15_i);
855  q_out7_i = vhsubq_s32 (q_s11_i, q_s15_r);
856 
857  RADIX8x4_STORE
858  } // f_count
859 }
860 
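/*
 * First-stage radix-4 kernel without twiddles, used when the leading factor
 * is 4. Same layout as the radix-8 kernel above, but with a single 4-point
 * butterfly per column.
 */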
861 #define RADIX4x4_WITHOUT_TW_START \
862  ne10_int32_t f_count; \
863  ne10_int32_t src_step = stride << 1; \
864  int32_t *p_src, *p_dst; \
865  int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3; \
866  int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i; \
867  int32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i; \
868  int32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
869  int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3; \
870  p_src = (int32_t *) Fin; \
871  p_dst = (int32_t *) Fout;
872 
873 #define RADIX4x4_WITHOUT_TW_LOAD \
874  q2_in0 = vld2q_s32 (p_src); \
875  p_src += src_step; \
876  q2_in1 = vld2q_s32 (p_src); \
877  p_src += src_step; \
878  q2_in2 = vld2q_s32 (p_src); \
879  p_src += src_step; \
880  q2_in3 = vld2q_s32 (p_src); \
881  p_src += src_step;
882 
883 #define RADIX4x4_WITHOUT_TW_STORE \
884  q2_tmp0 = vtrnq_s32 (q_out0_r, q_out1_r); \
885  q2_tmp1 = vtrnq_s32 (q_out0_i, q_out1_i); \
886  q2_tmp2 = vtrnq_s32 (q_out2_r, q_out3_r); \
887  q2_tmp3 = vtrnq_s32 (q_out2_i, q_out3_i); \
888  q2_out0.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[0]), vget_low_s32 (q2_tmp2.val[0])); \
889  q2_out0.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[0]), vget_low_s32 (q2_tmp3.val[0])); \
890  q2_out1.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[1]), vget_low_s32 (q2_tmp2.val[1])); \
891  q2_out1.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[1]), vget_low_s32 (q2_tmp3.val[1])); \
892  q2_out2.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[0]), vget_high_s32 (q2_tmp2.val[0])); \
893  q2_out2.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[0]), vget_high_s32 (q2_tmp3.val[0])); \
894  q2_out3.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[1]), vget_high_s32 (q2_tmp2.val[1])); \
895  q2_out3.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[1]), vget_high_s32 (q2_tmp3.val[1])); \
896  vst2q_s32 (p_dst, q2_out0); \
897  p_dst += 8; \
898  vst2q_s32 (p_dst, q2_out1); \
899  p_dst += 8; \
900  vst2q_s32 (p_dst, q2_out2); \
901  p_dst += 8; \
902  vst2q_s32 (p_dst, q2_out3); \
903  p_dst += 8; \
904  p_src = p_src - src_step * 4 + 8;
905 
906 #define RADIX4x4_WITHOUT_TW_S0 \
907  q_s0_r = vaddq_s32 (q2_in0.val[0], q2_in2.val[0]); \
908  q_s0_i = vaddq_s32 (q2_in0.val[1], q2_in2.val[1]); \
909  q_s1_r = vsubq_s32 (q2_in0.val[0], q2_in2.val[0]); \
910  q_s1_i = vsubq_s32 (q2_in0.val[1], q2_in2.val[1]); \
911  q_s2_r = vaddq_s32 (q2_in1.val[0], q2_in3.val[0]); \
912  q_s2_i = vaddq_s32 (q2_in1.val[1], q2_in3.val[1]); \
913  q_s3_r = vsubq_s32 (q2_in1.val[0], q2_in3.val[0]); \
914  q_s3_i = vsubq_s32 (q2_in1.val[1], q2_in3.val[1]); \
915  q_out2_r = vsubq_s32 (q_s0_r, q_s2_r); \
916  q_out2_i = vsubq_s32 (q_s0_i, q_s2_i); \
917  q_out0_r = vaddq_s32 (q_s0_r, q_s2_r); \
918  q_out0_i = vaddq_s32 (q_s0_i, q_s2_i);
919 
920 #define RADIX4x4_WITHOUT_TW_S0_SCALED \
921  q_s0_r = vhaddq_s32 (q2_in0.val[0], q2_in2.val[0]); \
922  q_s0_i = vhaddq_s32 (q2_in0.val[1], q2_in2.val[1]); \
923  q_s1_r = vhsubq_s32 (q2_in0.val[0], q2_in2.val[0]); \
924  q_s1_i = vhsubq_s32 (q2_in0.val[1], q2_in2.val[1]); \
925  q_s2_r = vhaddq_s32 (q2_in1.val[0], q2_in3.val[0]); \
926  q_s2_i = vhaddq_s32 (q2_in1.val[1], q2_in3.val[1]); \
927  q_s3_r = vhsubq_s32 (q2_in1.val[0], q2_in3.val[0]); \
928  q_s3_i = vhsubq_s32 (q2_in1.val[1], q2_in3.val[1]); \
929  q_out2_r = vhsubq_s32 (q_s0_r, q_s2_r); \
930  q_out2_i = vhsubq_s32 (q_s0_i, q_s2_i); \
931  q_out0_r = vhaddq_s32 (q_s0_r, q_s2_r); \
932  q_out0_i = vhaddq_s32 (q_s0_i, q_s2_i);
933 
934 
935 static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
936  ne10_fft_cpx_int32_t * Fin,
937  ne10_int32_t stride)
938 {
939  RADIX4x4_WITHOUT_TW_START
940 
941  for (f_count = 0; f_count < stride; f_count += 4)
942  {
943  // load
944  RADIX4x4_WITHOUT_TW_LOAD
945 
946  // radix 4 butterfly without twiddles
947  RADIX4x4_WITHOUT_TW_S0
948 
949  q_out1_r = vaddq_s32 (q_s1_r, q_s3_i);
950  q_out1_i = vsubq_s32 (q_s1_i, q_s3_r);
951  q_out3_r = vsubq_s32 (q_s1_r, q_s3_i);
952  q_out3_i = vaddq_s32 (q_s1_i, q_s3_r);
953 
954  RADIX4x4_WITHOUT_TW_STORE
955  }
956 }
957 
958 static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
959  ne10_fft_cpx_int32_t * Fin,
960  ne10_int32_t stride)
961 {
962  RADIX4x4_WITHOUT_TW_START
963 
964  for (f_count = 0; f_count < stride; f_count += 4)
965  {
966  // load
967  RADIX4x4_WITHOUT_TW_LOAD
968 
969  // radix 4 butterfly without twiddles
970  RADIX4x4_WITHOUT_TW_S0
971 
972  q_out1_r = vsubq_s32 (q_s1_r, q_s3_i);
973  q_out1_i = vaddq_s32 (q_s1_i, q_s3_r);
974  q_out3_r = vaddq_s32 (q_s1_r, q_s3_i);
975  q_out3_i = vsubq_s32 (q_s1_i, q_s3_r);
976 
977  RADIX4x4_WITHOUT_TW_STORE
978  }
979 }
980 
981 static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
982  ne10_fft_cpx_int32_t * Fin,
983  ne10_int32_t stride)
984 {
985  RADIX4x4_WITHOUT_TW_START
986 
987  for (f_count = 0; f_count < stride; f_count += 4)
988  {
989  // load
990  RADIX4x4_WITHOUT_TW_LOAD
991 
992  // radix 4 butterfly without twiddles
993  RADIX4x4_WITHOUT_TW_S0_SCALED
994 
995  q_out1_r = vhaddq_s32 (q_s1_r, q_s3_i);
996  q_out1_i = vhsubq_s32 (q_s1_i, q_s3_r);
997  q_out3_r = vhsubq_s32 (q_s1_r, q_s3_i);
998  q_out3_i = vhaddq_s32 (q_s1_i, q_s3_r);
999 
1000  RADIX4x4_WITHOUT_TW_STORE
1001  }
1002 }
1003 
1004 static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
1005  ne10_fft_cpx_int32_t * Fin,
1006  ne10_int32_t stride)
1007 {
1008  RADIX4x4_WITHOUT_TW_START
1009 
1010  for (f_count = 0; f_count < stride; f_count += 4)
1011  {
1012  // load
1013  RADIX4x4_WITHOUT_TW_LOAD
1014 
1015  // radix 4 butterfly without twiddles
1016  RADIX4x4_WITHOUT_TW_S0_SCALED
1017 
1018  q_out1_r = vhsubq_s32 (q_s1_r, q_s3_i);
1019  q_out1_i = vhaddq_s32 (q_s1_i, q_s3_r);
1020  q_out3_r = vhaddq_s32 (q_s1_r, q_s3_i);
1021  q_out3_i = vhsubq_s32 (q_s1_i, q_s3_r);
1022 
1023  RADIX4x4_WITHOUT_TW_STORE
1024  }
1025 }
1026 
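/*
 * General-stage radix-4 kernel with twiddles. Twiddle products are formed
 * with vqdmulhq_s32 (saturating doubling multiply, high half), a truncating
 * Q31 multiply; the _S1_FWD and _S1_INV steps combine the partial products
 * with opposite signs, multiplying by the twiddle factor for the forward
 * transform and by its conjugate for the inverse.
 */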
1027 #define RADIX4x4_WITH_TW_START \
1028  ne10_int32_t m_count; \
1029  ne10_int32_t src_step = src_stride << 1; \
1030  ne10_int32_t dst_step = dst_stride << 1; \
1031  ne10_int32_t tw_step = mstride << 1; \
1032  int32_t *p_src, *p_dst, *p_tw; \
1033  int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3; \
1034  int32x4x2_t q2_tw0, q2_tw1, q2_tw2; \
1035  int32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i; \
1036  int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5; \
1037  int32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i; \
1038  int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3; \
1039  p_src = (int32_t *) Fin; \
1040  p_dst = (int32_t *) Fout; \
1041  p_tw = (int32_t *) tw;
1042 
1043 #define RADIX4x4_WITH_TW_LOAD \
1044  q2_in0 = vld2q_s32 (p_src); \
1045  p_src += src_step; \
1046  q2_in1 = vld2q_s32 (p_src); \
1047  p_src += src_step; \
1048  q2_in2 = vld2q_s32 (p_src); \
1049  p_src += src_step; \
1050  q2_in3 = vld2q_s32 (p_src); \
1051  p_src += src_step; \
1052  q2_tw0 = vld2q_s32 (p_tw); \
1053  p_tw += tw_step; \
1054  q2_tw1 = vld2q_s32 (p_tw); \
1055  p_tw += tw_step; \
1056  q2_tw2 = vld2q_s32 (p_tw); \
1057  q_s1_r = vqdmulhq_s32 (q2_in1.val[0], q2_tw0.val[0]); \
1058  q_s1_i = vqdmulhq_s32 (q2_in1.val[1], q2_tw0.val[0]); \
1059  q_s2_r = vqdmulhq_s32 (q2_in2.val[0], q2_tw1.val[0]); \
1060  q_s2_i = vqdmulhq_s32 (q2_in2.val[1], q2_tw1.val[0]); \
1061  q_s3_r = vqdmulhq_s32 (q2_in3.val[0], q2_tw2.val[0]); \
1062  q_s3_i = vqdmulhq_s32 (q2_in3.val[1], q2_tw2.val[0]); \
1063  q_tmp0 = vqdmulhq_s32 (q2_in1.val[1], q2_tw0.val[1]); \
1064  q_tmp1 = vqdmulhq_s32 (q2_in1.val[0], q2_tw0.val[1]); \
1065  q_tmp2 = vqdmulhq_s32 (q2_in2.val[1], q2_tw1.val[1]); \
1066  q_tmp3 = vqdmulhq_s32 (q2_in2.val[0], q2_tw1.val[1]); \
1067  q_tmp4 = vqdmulhq_s32 (q2_in3.val[1], q2_tw2.val[1]); \
1068  q_tmp5 = vqdmulhq_s32 (q2_in3.val[0], q2_tw2.val[1]);
1069 
1070 #define RADIX4x4_WITH_TW_STORE \
1071  vst2q_s32 (p_dst, q2_out0); \
1072  p_dst += dst_step; \
1073  vst2q_s32 (p_dst, q2_out1); \
1074  p_dst += dst_step; \
1075  vst2q_s32 (p_dst, q2_out2); \
1076  p_dst += dst_step; \
1077  vst2q_s32 (p_dst, q2_out3); \
1078  p_dst += dst_step; \
1079  p_src = p_src - src_step * 4 + 8; \
1080  p_dst = p_dst - dst_step * 4 + 8; \
1081  p_tw = p_tw - tw_step * 2 + 8;
1082 
1083 #define RADIX4x4_WITH_TW_S1_FWD \
1084  q_s1_r = vsubq_s32 (q_s1_r, q_tmp0); \
1085  q_s1_i = vaddq_s32 (q_s1_i, q_tmp1); \
1086  q_s2_r = vsubq_s32 (q_s2_r, q_tmp2); \
1087  q_s2_i = vaddq_s32 (q_s2_i, q_tmp3); \
1088  q_s3_r = vsubq_s32 (q_s3_r, q_tmp4); \
1089  q_s3_i = vaddq_s32 (q_s3_i, q_tmp5);
1090 
1091 #define RADIX4x4_WITH_TW_S1_INV \
1092  q_s1_r = vaddq_s32 (q_s1_r, q_tmp0); \
1093  q_s1_i = vsubq_s32 (q_s1_i, q_tmp1); \
1094  q_s2_r = vaddq_s32 (q_s2_r, q_tmp2); \
1095  q_s2_i = vsubq_s32 (q_s2_i, q_tmp3); \
1096  q_s3_r = vaddq_s32 (q_s3_r, q_tmp4); \
1097  q_s3_i = vsubq_s32 (q_s3_i, q_tmp5);
1098 
1099 
1100 #define RADIX4x4_WITH_TW_LS_02 \
1101  q_s4_r = vaddq_s32 (q2_in0.val[0], q_s2_r); \
1102  q_s4_i = vaddq_s32 (q2_in0.val[1], q_s2_i); \
1103  q_s5_r = vsubq_s32 (q2_in0.val[0], q_s2_r); \
1104  q_s5_i = vsubq_s32 (q2_in0.val[1], q_s2_i); \
1105  q_s6_r = vaddq_s32 (q_s1_r, q_s3_r); \
1106  q_s6_i = vaddq_s32 (q_s1_i, q_s3_i); \
1107  q_s7_r = vsubq_s32 (q_s1_r, q_s3_r); \
1108  q_s7_i = vsubq_s32 (q_s1_i, q_s3_i); \
1109  q2_out2.val[0] = vsubq_s32 (q_s4_r, q_s6_r); \
1110  q2_out2.val[1] = vsubq_s32 (q_s4_i, q_s6_i); \
1111  q2_out0.val[0] = vaddq_s32 (q_s4_r, q_s6_r); \
1112  q2_out0.val[1] = vaddq_s32 (q_s4_i, q_s6_i);
1113 
1114 #define RADIX4x4_WITH_TW_LS_02_SCALED \
1115  q_s4_r = vhaddq_s32 (q2_in0.val[0], q_s2_r); \
1116  q_s4_i = vhaddq_s32 (q2_in0.val[1], q_s2_i); \
1117  q_s5_r = vhsubq_s32 (q2_in0.val[0], q_s2_r); \
1118  q_s5_i = vhsubq_s32 (q2_in0.val[1], q_s2_i); \
1119  q_s6_r = vhaddq_s32 (q_s1_r, q_s3_r); \
1120  q_s6_i = vhaddq_s32 (q_s1_i, q_s3_i); \
1121  q_s7_r = vhsubq_s32 (q_s1_r, q_s3_r); \
1122  q_s7_i = vhsubq_s32 (q_s1_i, q_s3_i); \
1123  q2_out2.val[0] = vhsubq_s32 (q_s4_r, q_s6_r); \
1124  q2_out2.val[1] = vhsubq_s32 (q_s4_i, q_s6_i); \
1125  q2_out0.val[0] = vhaddq_s32 (q_s4_r, q_s6_r); \
1126  q2_out0.val[1] = vhaddq_s32 (q_s4_i, q_s6_i);
1127 
1128 
1129 static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
1130  ne10_fft_cpx_int32_t * Fin,
1131  ne10_fft_cpx_int32_t * tw,
1132  ne10_int32_t src_stride,
1133  ne10_int32_t dst_stride,
1134  ne10_int32_t mstride)
1135 {
1136  RADIX4x4_WITH_TW_START
1137 
1138  for (m_count = 0; m_count < mstride; m_count += 4)
1139  {
1140  // load
1141  RADIX4x4_WITH_TW_LOAD
1142  RADIX4x4_WITH_TW_S1_FWD
1143 
1144  RADIX4x4_WITH_TW_LS_02
1145 
1146  q2_out1.val[0] = vaddq_s32 (q_s5_r, q_s7_i);
1147  q2_out1.val[1] = vsubq_s32 (q_s5_i, q_s7_r);
1148  q2_out3.val[0] = vsubq_s32 (q_s5_r, q_s7_i);
1149  q2_out3.val[1] = vaddq_s32 (q_s5_i, q_s7_r);
1150 
1151  // store
1152  RADIX4x4_WITH_TW_STORE
1153  }
1154 }
1155 
1156 
1157 static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
1158  ne10_fft_cpx_int32_t * Fin,
1159  ne10_fft_cpx_int32_t * tw,
1160  ne10_int32_t src_stride,
1161  ne10_int32_t dst_stride,
1162  ne10_int32_t mstride)
1163 {
1164  RADIX4x4_WITH_TW_START
1165 
1166  for (m_count = 0; m_count < mstride; m_count += 4)
1167  {
1168  // load
1169  RADIX4x4_WITH_TW_LOAD
1170  RADIX4x4_WITH_TW_S1_INV
1171 
1172  RADIX4x4_WITH_TW_LS_02
1173 
1174  q2_out1.val[0] = vsubq_s32 (q_s5_r, q_s7_i);
1175  q2_out1.val[1] = vaddq_s32 (q_s5_i, q_s7_r);
1176  q2_out3.val[0] = vaddq_s32 (q_s5_r, q_s7_i);
1177  q2_out3.val[1] = vsubq_s32 (q_s5_i, q_s7_r);
1178 
1179  // store
1180  RADIX4x4_WITH_TW_STORE
1181  }
1182 }
1183 
1184 
1185 
1186 static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
1187  ne10_fft_cpx_int32_t * Fin,
1188  ne10_fft_cpx_int32_t * tw,
1189  ne10_int32_t src_stride,
1190  ne10_int32_t dst_stride,
1191  ne10_int32_t mstride)
1192 {
1193  RADIX4x4_WITH_TW_START
1194 
1195  for (m_count = 0; m_count < mstride; m_count += 4)
1196  {
1197  // load
1198  RADIX4x4_WITH_TW_LOAD
1199  RADIX4x4_WITH_TW_S1_FWD
1200 
1201  RADIX4x4_WITH_TW_LS_02_SCALED
1202 
1203  q2_out1.val[0] = vhaddq_s32 (q_s5_r, q_s7_i);
1204  q2_out1.val[1] = vhsubq_s32 (q_s5_i, q_s7_r);
1205  q2_out3.val[0] = vhsubq_s32 (q_s5_r, q_s7_i);
1206  q2_out3.val[1] = vhaddq_s32 (q_s5_i, q_s7_r);
1207 
1208  // store
1209  RADIX4x4_WITH_TW_STORE
1210  }
1211 }
1212 
1213 static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
1214  ne10_fft_cpx_int32_t * Fin,
1215  ne10_fft_cpx_int32_t * tw,
1216  ne10_int32_t src_stride,
1217  ne10_int32_t dst_stride,
1218  ne10_int32_t mstride)
1219 {
1220  RADIX4x4_WITH_TW_START
1221 
1222  for (m_count = 0; m_count < mstride; m_count += 4)
1223  {
1224  // load
1225  RADIX4x4_WITH_TW_LOAD
1226  RADIX4x4_WITH_TW_S1_INV
1227 
1228  RADIX4x4_WITH_TW_LS_02_SCALED
1229 
1230  q2_out1.val[0] = vhsubq_s32 (q_s5_r, q_s7_i);
1231  q2_out1.val[1] = vhaddq_s32 (q_s5_i, q_s7_r);
1232  q2_out3.val[0] = vhaddq_s32 (q_s5_r, q_s7_i);
1233  q2_out3.val[1] = vhsubq_s32 (q_s5_i, q_s7_r);
1234 
1235  // store
1236  RADIX4x4_WITH_TW_STORE
1237  }
1238 }
1239 
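/*
 * Stage scheduler for the forward transform. factors[0] holds the stage count
 * and factors[1] the initial fstride. When the first-stage radix is 2 it is
 * merged with the following radix-4 stage into the radix-8x4 kernel
 * (consuming two factor entries); a first-stage radix of 4 uses the radix-4x4
 * kernel without twiddles. Every remaining stage applies the radix-4 kernel
 * with twiddles, multiplying mstride by 4 and dividing fstride by 4 per pass,
 * while ping-ponging data between 'buffer' and 'Fout' so that the last stage
 * writes directly into the caller's output (Fout_ls).
 */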
1240 #define ne10_mixed_radix_fft_forward_int32_neon(scaled) \
1241 void ne10_mixed_radix_fft_forward_int32_##scaled##_neon (ne10_fft_cpx_int32_t * Fout, \
1242  ne10_fft_cpx_int32_t * Fin, \
1243  ne10_int32_t * factors, \
1244  ne10_fft_cpx_int32_t * twiddles, \
1245  ne10_fft_cpx_int32_t * buffer) \
1246 { \
1247  ne10_int32_t fstride, mstride, N; \
1248  ne10_int32_t fstride1; \
1249  ne10_int32_t f_count; \
1250  ne10_int32_t stage_count; \
1251  \
1252  ne10_fft_cpx_int32_t *Fin1, *Fout1; \
1253  ne10_fft_cpx_int32_t *Fout_ls = Fout; \
1254  ne10_fft_cpx_int32_t *Ftmp; \
1255  ne10_fft_cpx_int32_t *tw, *tw1; \
1256  \
1257  /* init fstride, mstride, N */ \
1258  stage_count = factors[0]; \
1259  fstride = factors[1]; \
1260  mstride = factors[ (stage_count << 1) - 1 ]; \
1261  N = factors[ stage_count << 1 ]; \
1262  \
1263  /* the first stage */ \
1264  Fin1 = Fin; \
1265  Fout1 = Fout; \
1266  if (N == 2) \
1267  { \
1268  N = fstride >> 1;\
1269  tw = twiddles; \
1270  fstride1 = fstride >> 2; \
1271  ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride1);\
1272  \
1273  tw += 6; \
1274  mstride <<= 2; \
1275  fstride >>= 4; \
1276  stage_count -= 2; \
1277  \
1278  Ftmp = buffer; \
1279  buffer = Fout; \
1280  Fout = Ftmp; \
1281  } \
1282  else if (N == 4) \
1283  { \
1284  ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
1285  N = fstride; \
1286  Ftmp = buffer; \
1287  buffer = Fout; \
1288  Fout = Ftmp; \
1289  /* update address for other stages*/ \
1290  stage_count--; \
1291  tw = twiddles; \
1292  fstride >>= 2; \
1293  } \
1294  /* others but the last one*/ \
1295  for (; stage_count > 1 ; stage_count--) \
1296  { \
1297  Fin1 = buffer; \
1298  for (f_count = 0; f_count < fstride; f_count ++) \
1299  { \
1300  Fout1 = & Fout[ f_count * mstride << 2 ]; \
1301  tw1 = tw; \
1302  ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1303  Fin1 += mstride; \
1304  } \
1305  tw += mstride * 3; \
1306  mstride <<= 2; \
1307  Ftmp = buffer; \
1308  buffer = Fout; \
1309  Fout = Ftmp; \
1310  fstride >>= 2; \
1311  }\
1312  /* the last one*/ \
1313  if (stage_count) \
1314  { \
1315  Fin1 = buffer; \
1316  Fout1 = Fout_ls; \
1317  for (f_count = 0; f_count < fstride; f_count ++) \
1318  { \
1319  tw1 = tw; \
1320  ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1321  Fin1 += mstride; \
1322  Fout1 += mstride; \
1323  } \
1324  } \
1325 }
1326 
1327 #define ne10_mixed_radix_fft_backward_int32_neon(scaled) \
1328 void ne10_mixed_radix_fft_backward_int32_##scaled##_neon (ne10_fft_cpx_int32_t * Fout, \
1329  ne10_fft_cpx_int32_t * Fin, \
1330  ne10_int32_t * factors, \
1331  ne10_fft_cpx_int32_t * twiddles, \
1332  ne10_fft_cpx_int32_t * buffer) \
1333 { \
1334  ne10_int32_t fstride, mstride, N; \
1335  ne10_int32_t fstride1; \
1336  ne10_int32_t f_count; \
1337  ne10_int32_t stage_count; \
1338  \
1339  ne10_fft_cpx_int32_t *Fin1, *Fout1; \
1340  ne10_fft_cpx_int32_t *Fout_ls = Fout; \
1341  ne10_fft_cpx_int32_t *Ftmp; \
1342  ne10_fft_cpx_int32_t *tw, *tw1; \
1343  \
1344  /* init fstride, mstride, N */ \
1345  stage_count = factors[0]; \
1346  fstride = factors[1]; \
1347  mstride = factors[ (stage_count << 1) - 1 ]; \
1348  N = factors[ stage_count << 1 ]; \
1349  \
1350  /* the first stage */ \
1351  Fin1 = Fin; \
1352  Fout1 = Fout; \
1353  if (N == 2) \
1354  { \
1355  N = fstride >> 1;\
1356  tw = twiddles; \
1357  fstride1 = fstride >> 2; \
1358  ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride1);\
1359  \
1360  tw += 6; \
1361  mstride <<= 2; \
1362  fstride >>= 4; \
1363  stage_count -= 2; \
1364  \
1365  Ftmp = buffer; \
1366  buffer = Fout; \
1367  Fout = Ftmp; \
1368  } \
1369  else if (N == 4) \
1370  { \
1371  ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
1372  N = fstride; \
1373  Ftmp = buffer; \
1374  buffer = Fout; \
1375  Fout = Ftmp; \
1376  /* update address for other stages*/ \
1377  stage_count--; \
1378  tw = twiddles; \
1379  fstride >>= 2; \
1380  } \
1381  /* others but the last one*/ \
1382  for (; stage_count > 1 ; stage_count--) \
1383  { \
1384  Fin1 = buffer; \
1385  for (f_count = 0; f_count < fstride; f_count ++) \
1386  { \
1387  Fout1 = & Fout[ f_count * mstride << 2 ]; \
1388  tw1 = tw; \
1389  ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1390  Fin1 += mstride; \
1391  } \
1392  tw += mstride * 3; \
1393  mstride <<= 2; \
1394  Ftmp = buffer; \
1395  buffer = Fout; \
1396  Fout = Ftmp; \
1397  fstride >>= 2; \
1398  }\
1399  /* the last one*/ \
1400  if (stage_count) \
1401  { \
1402  Fin1 = buffer; \
1403  Fout1 = Fout_ls; \
1404  for (f_count = 0; f_count < fstride; f_count ++) \
1405  { \
1406  tw1 = tw; \
1407  ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1408  Fin1 += mstride; \
1409  Fout1 += mstride; \
1410  } \
1411  } \
1412 }
1413 
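/* Expand the macros above into the four mixed-radix kernels: forward/backward, unscaled/scaled. */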
1414 ne10_mixed_radix_fft_forward_int32_neon (unscaled)
1415 ne10_mixed_radix_fft_forward_int32_neon (scaled)
1416 ne10_mixed_radix_fft_backward_int32_neon (unscaled)
1417 ne10_mixed_radix_fft_backward_int32_neon (scaled)
1418 
1419 
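/*
 * Post-processing for the real-to-complex FFT: converts the ncfft-point
 * complex transform of the packed even/odd samples into the spectrum of the
 * 2*ncfft-point real input. Bins are processed four at a time with NEON when
 * ncfft/2 >= 4; smaller sizes fall through to the scalar loop at the end,
 * which performs the same f1k/f2k/twiddle arithmetic.
 */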
1420 static void ne10_fft_split_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
1421  const ne10_fft_cpx_int32_t *src,
1422  ne10_fft_cpx_int32_t *twiddles,
1423  ne10_int32_t ncfft,
1424  ne10_int32_t scaled_flag)
1425 {
1426  ne10_int32_t k;
1427  ne10_int32_t count = ncfft / 2;
1428  ne10_fft_cpx_int32_t fpnk, fpk, f1k, f2k, tw, tdc;
1429  int32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
1430  int32x4_t q_fpnk_r, q_fpnk_i;
1431  int32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
1432  int32x4_t q_tw_r, q_tw_i;
1433  int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1434  int32x4_t q_dst2_r, q_dst2_i;
1435  int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1436 
1437  tdc.r = src[0].r;
1438  tdc.i = src[0].i;
1439 
1440  if (scaled_flag)
1441  NE10_F2I32_FIXDIV (tdc, 2);
1442 
1443  dst[0].r = tdc.r + tdc.i;
1444  dst[ncfft].r = tdc.r - tdc.i;
1445  dst[ncfft].i = dst[0].i = 0;
1446  if (count >= 4)
1447  {
1448 
1449  if (scaled_flag)
1450  {
1451  for (k = 1; k <= count ; k += 4)
1452  {
1453  p_src = (int32_t*) (& (src[k]));
1454  p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1455  p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1456  p_dst = (int32_t*) (& (dst[k]));
1457  p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1458 
1459  q2_fpk = vld2q_s32 (p_src);
1460  q2_fpnk = vld2q_s32 (p_src2);
1461 
1462  q2_tw = vld2q_s32 (p_twiddles);
1463  q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
1464  q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
1465  q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
1466  q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
1467  q_fpnk_i = vnegq_s32 (q_fpnk_i);
1468 
1469  q_f1k_r = vhaddq_s32 (q2_fpk.val[0], q_fpnk_r);
1470  q_f1k_i = vhaddq_s32 (q2_fpk.val[1], q_fpnk_i);
1471 
1472  q_f2k_r = vhsubq_s32 (q2_fpk.val[0], q_fpnk_r);
1473  q_f2k_i = vhsubq_s32 (q2_fpk.val[1], q_fpnk_i);
1474 
1475  q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
1476  q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
1477  q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
1478  q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
1479  q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
1480  q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);
1481 
1482  q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
1483  q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
1484  q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
1485  q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
1486  q_dst2_r = vrev64q_s32 (q_dst2_r);
1487  q_dst2_i = vrev64q_s32 (q_dst2_i);
1488  q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1489  q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1490  vst2q_s32 (p_dst, q2_dst);
1491  vst2q_s32 (p_dst2, q2_dst2);
1492 
1493  }
1494  }
1495  else
1496  {
1497  for (k = 1; k <= count ; k += 4)
1498  {
1499  p_src = (int32_t*) (& (src[k]));
1500  p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1501  p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1502  p_dst = (int32_t*) (& (dst[k]));
1503  p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1504 
1505  q2_fpk = vld2q_s32 (p_src);
1506  q2_fpnk = vld2q_s32 (p_src2);
1507 
1508  q2_tw = vld2q_s32 (p_twiddles);
1509  q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
1510  q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
1511  q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
1512  q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
1513  q_fpnk_i = vnegq_s32 (q_fpnk_i);
1514 
1515  q_f1k_r = vaddq_s32 (q2_fpk.val[0], q_fpnk_r);
1516  q_f1k_i = vaddq_s32 (q2_fpk.val[1], q_fpnk_i);
1517 
1518  q_f2k_r = vsubq_s32 (q2_fpk.val[0], q_fpnk_r);
1519  q_f2k_i = vsubq_s32 (q2_fpk.val[1], q_fpnk_i);
1520 
1521  q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
1522  q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
1523  q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
1524  q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
1525  q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
1526  q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);
1527 
1528  q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
1529  q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
1530  q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
1531  q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
1532  q_dst2_r = vrev64q_s32 (q_dst2_r);
1533  q_dst2_i = vrev64q_s32 (q_dst2_i);
1534  q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1535  q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1536  vst2q_s32 (p_dst, q2_dst);
1537  vst2q_s32 (p_dst2, q2_dst2);
1538 
1539  }
1540  }
1541  }
1542  else
1543  {
1544 
1545  for (k = 1; k <= ncfft / 2 ; ++k)
1546  {
1547  fpk = src[k];
1548  fpnk.r = src[ncfft - k].r;
1549  fpnk.i = - src[ncfft - k].i;
1550  if (scaled_flag)
1551  {
1552  NE10_F2I32_FIXDIV (fpk, 2);
1553  NE10_F2I32_FIXDIV (fpnk, 2);
1554  }
1555 
1556  f1k.r = fpk.r + fpnk.r;
1557  f1k.i = fpk.i + fpnk.i;
1558 
1559  f2k.r = fpk.r - fpnk.r;
1560  f2k.i = fpk.i - fpnk.i;
1561 
1562  tw.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> 32))) << 1;
1563  tw.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).i) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> 32))) << 1;
1564 
1565  dst[k].r = (f1k.r + tw.r) >> 1;
1566  dst[k].i = (f1k.i + tw.i) >> 1;
1567  dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
1568  dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
1569  }
1570  }
1571 }
1572 
1573 static void ne10_fft_split_c2r_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
1574  const ne10_fft_cpx_int32_t *src,
1575  ne10_fft_cpx_int32_t *twiddles,
1576  ne10_int32_t ncfft,
1577  ne10_int32_t scaled_flag)
1578 {
1579 
1580  ne10_int32_t k;
1581  ne10_int32_t count = ncfft / 2;
1582  ne10_fft_cpx_int32_t fk, fnkc, fek, fok, tmp;
1583  int32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
1584  int32x4_t q_fnkc_r, q_fnkc_i;
1585  int32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
1586  int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1587  int32x4_t q_dst2_r, q_dst2_i;
1588  int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1589 
1590 
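 // Bin 0 of the half-length complex spectrum is rebuilt from the packed
 // DC (src[0].r) and Nyquist (src[ncfft].r) values.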
1591  dst[0].r = src[0].r + src[ncfft].r;
1592  dst[0].i = src[0].r - src[ncfft].r;
1593  if (scaled_flag)
1594  NE10_F2I32_FIXDIV (dst[0], 2);
1595  if (count >= 4)
1596  {
1597  if (scaled_flag)
1598  {
1599  for (k = 1; k <= count ; k += 4)
1600  {
1601  p_src = (int32_t*) (& (src[k]));
1602  p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1603  p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1604  p_dst = (int32_t*) (& (dst[k]));
1605  p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1606 
1607  q2_fk = vld2q_s32 (p_src);
1608  q2_fnkc = vld2q_s32 (p_src2);
1609  q2_tw = vld2q_s32 (p_twiddles);
1610  q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
1611  q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
1612  q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
1613  q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
1614  q_fnkc_i = vnegq_s32 (q_fnkc_i);
1615 
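 // Even/odd recombination: fek is the even part F[k] + conj(F[N-k]), and
 // the odd part F[k] - conj(F[N-k]) is rotated by the conjugate twiddle;
 // here the halving ops also apply the FIXDIV-by-2 scaling of the scalar path.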
1616  q_fek_r = vhaddq_s32 (q2_fk.val[0], q_fnkc_r);
1617  q_fek_i = vhaddq_s32 (q2_fk.val[1], q_fnkc_i);
1618  q_tmp0 = vhsubq_s32 (q2_fk.val[0], q_fnkc_r);
1619  q_tmp1 = vhsubq_s32 (q2_fk.val[1], q_fnkc_i);
1620 
1621  q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
1622  q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
1623  q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
1624  q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
1625  q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
1626  q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);
1627 
1628  q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
1629  q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
1630  q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
1631  q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
1632  q_dst2_r = vrev64q_s32 (q_dst2_r);
1633  q_dst2_i = vrev64q_s32 (q_dst2_i);
1634  q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1635  q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1636  vst2q_s32 (p_dst, q2_dst);
1637  vst2q_s32 (p_dst2, q2_dst2);
1638 
1639  }
1640 
1641  }
1642  else
1643  {
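 // Unscaled variant: the same even/odd recombination with full-width
 // adds and subtracts.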
1644  for (k = 1; k <= count ; k += 4)
1645  {
1646  p_src = (int32_t*) (& (src[k]));
1647  p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1648  p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1649  p_dst = (int32_t*) (& (dst[k]));
1650  p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1651 
1652  q2_fk = vld2q_s32 (p_src);
1653  q2_fnkc = vld2q_s32 (p_src2);
1654  q2_tw = vld2q_s32 (p_twiddles);
1655  q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
1656  q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
1657  q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
1658  q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
1659  q_fnkc_i = vnegq_s32 (q_fnkc_i);
1660 
1661  q_fek_r = vaddq_s32 (q2_fk.val[0], q_fnkc_r);
1662  q_fek_i = vaddq_s32 (q2_fk.val[1], q_fnkc_i);
1663  q_tmp0 = vsubq_s32 (q2_fk.val[0], q_fnkc_r);
1664  q_tmp1 = vsubq_s32 (q2_fk.val[1], q_fnkc_i);
1665 
1666  q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
1667  q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
1668  q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
1669  q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
1670  q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
1671  q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);
1672 
1673  q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
1674  q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
1675  q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
1676  q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
1677  q_dst2_r = vrev64q_s32 (q_dst2_r);
1678  q_dst2_i = vrev64q_s32 (q_dst2_i);
1679  q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1680  q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1681  vst2q_s32 (p_dst, q2_dst);
1682  vst2q_s32 (p_dst2, q2_dst2);
1683 
1684  }
1685  }
1686  }
1687  else
1688  {
1689 
1690  for (k = 1; k <= ncfft / 2; k++)
1691  {
1692  fk = src[k];
1693  fnkc.r = src[ncfft - k].r;
1694  fnkc.i = -src[ncfft - k].i;
1695  if (scaled_flag)
1696  {
1697  NE10_F2I32_FIXDIV (fk, 2);
1698  NE10_F2I32_FIXDIV (fnkc, 2);
1699  }
1700 
1701  fek.r = fk.r + fnkc.r;
1702  fek.i = fk.i + fnkc.i;
1703 
1704  tmp.r = fk.r - fnkc.r;
1705  tmp.i = fk.i - fnkc.i;
1706 
1707  fok.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).r) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> 32))) << 1;
1708  fok.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> 32))) << 1;
1709 
1710  dst[k].r = fek.r + fok.r;
1711  dst[k].i = fek.i + fok.i;
1712 
1713  dst[ncfft - k].r = fek.r - fok.r;
1714  dst[ncfft - k].i = fok.i - fek.i;
1715  }
1716  }
1717 }
1718 
1719 
1739 void ne10_fft_c2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
1740  ne10_fft_cpx_int32_t *fin,
1741  ne10_fft_cfg_int32_t cfg,
1742  ne10_int32_t inverse_fft,
1743  ne10_int32_t scaled_flag)
1744 {
1745  // For inputs shorter than 16, fall back to the C version;
1746  // NEON would not give much improvement for these sizes.
1747  if (cfg->nfft < 16)
1748  {
1749  ne10_fft_c2c_1d_int32_c (fout, fin, cfg, inverse_fft, scaled_flag);
1750  return;
1751  }
1752 
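 // factors[0] holds the number of radix stages; the slot at
 // 2 * (stage_count + 1) records whether the plan was built for the
 // radix-2/4 kernels or for the generic mixed-radix path.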
1753  ne10_int32_t stage_count = cfg->factors[0];
1754  ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];
1755 
1756  assert ((algorithm_flag == NE10_FFT_ALG_24)
1757  || (algorithm_flag == NE10_FFT_ALG_ANY));
1758 
1759  // For NE10_FFT_ALG_ANY.
1760  // Function will return inside this branch.
1761  if (algorithm_flag == NE10_FFT_ALG_ANY)
1762  {
1763  if (inverse_fft)
1764  {
1765  ne10_mixed_radix_generic_butterfly_inverse_int32_neon (fout, fin,
1766  cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1767  }
1768  else
1769  {
1770  ne10_mixed_radix_generic_butterfly_int32_neon (fout, fin,
1771  cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1772  }
1773  return;
1774  }
1775 
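 // Radix-2/4 dispatch: sizes 4, 8 and 16 use dedicated kernels, larger
 // powers of two go through the mixed radix-2/4 stages. The scaled
 // variants shift right at each stage to keep the fixed-point data in range.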
1776  if (scaled_flag)
1777  {
1778  if (inverse_fft)
1779  {
1780  switch (cfg->nfft)
1781  {
1782  case 4:
1783  ne10_fft4_backward_int32_scaled (fout, fin);
1784  break;
1785  case 8:
1786  ne10_fft8_backward_int32_scaled (fout, fin);
1787  break;
1788  case 16:
1789  ne10_fft16_backward_int32_scaled_neon (fout, fin, cfg->twiddles);
1790  break;
1791  default:
1792  ne10_mixed_radix_fft_backward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1793  break;
1794  }
1795  }
1796  else
1797  {
1798  switch (cfg->nfft)
1799  {
1800  case 4:
1801  ne10_fft4_forward_int32_scaled (fout, fin);
1802  break;
1803  case 8:
1804  ne10_fft8_forward_int32_scaled (fout, fin);
1805  break;
1806  case 16:
1807  ne10_fft16_forward_int32_scaled_neon (fout, fin, cfg->twiddles);
1808  break;
1809  default:
1810  ne10_mixed_radix_fft_forward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1811  break;
1812  }
1813  }
1814  }
1815  else
1816  {
1817  if (inverse_fft)
1818  {
1819  switch (cfg->nfft)
1820  {
1821  case 4:
1822  ne10_fft4_backward_int32_unscaled (fout, fin);
1823  break;
1824  case 8:
1825  ne10_fft8_backward_int32_unscaled (fout, fin);
1826  break;
1827  case 16:
1828  ne10_fft16_backward_int32_unscaled_neon (fout, fin, cfg->twiddles);
1829  break;
1830  default:
1831  ne10_mixed_radix_fft_backward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1832  break;
1833  }
1834  }
1835  else
1836  {
1837  switch (cfg->nfft)
1838  {
1839  case 4:
1840  ne10_fft4_forward_int32_unscaled (fout, fin);
1841  break;
1842  case 8:
1843  ne10_fft8_forward_int32_unscaled (fout, fin);
1844  break;
1845  case 16:
1846  ne10_fft16_forward_int32_unscaled_neon (fout, fin, cfg->twiddles);
1847  break;
1848  default:
1849  ne10_mixed_radix_fft_forward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1850  break;
1851  }
1852  }
1853  }
1854 }
1855  //end of C2C_FFT_IFFT group
1859 
1876 void ne10_fft_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
1877  ne10_int32_t *fin,
1878  ne10_fft_r2c_cfg_int32_t cfg,
1879  ne10_int32_t scaled_flag)
1880 {
1881  ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
1882  ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1883  ne10_fft_state_int32_t c2c_state;
1884 
1885  c2c_state.nfft = cfg->ncfft;
1886  c2c_state.factors = cfg->factors;
1887  c2c_state.twiddles = cfg->twiddles;
1888  c2c_state.buffer = tmpbuf2;
1889 
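 // Forward R2C: view the 2*ncfft real samples as ncfft complex samples,
 // run a half-length complex FFT, then unzip the packed spectrum with the
 // split step and the super twiddles.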
1890  ne10_fft_c2c_1d_int32_neon (tmpbuf1, (ne10_fft_cpx_int32_t*) fin, &c2c_state, 0, scaled_flag);
1891  ne10_fft_split_r2c_1d_int32_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1892 }
1893 
1905 void ne10_fft_c2r_1d_int32_neon (ne10_int32_t *fout,
1906  ne10_fft_cpx_int32_t *fin,
1907  ne10_fft_r2c_cfg_int32_t cfg,
1908  ne10_int32_t scaled_flag)
1909 {
1910  ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
1911  ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1912  ne10_fft_state_int32_t c2c_state;
1913 
1914  c2c_state.nfft = cfg->ncfft;
1915  c2c_state.factors = cfg->factors;
1916  c2c_state.twiddles = cfg->twiddles;
1917  c2c_state.buffer = tmpbuf2;
1918 
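 // Inverse of the R2C packing: merge the half-spectrum with the split
 // step, then a half-length inverse complex FFT recovers the real samples.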
1919  ne10_fft_split_c2r_1d_int32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1920  ne10_fft_c2c_1d_int32_neon ( (ne10_fft_cpx_int32_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
1921 }
1922 
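For reference, the sketch below shows one way the top-level NEON entry point above is typically driven. It is not part of this source file, and the allocation and teardown helpers (ne10_fft_alloc_c2c_int32_neon, ne10_fft_destroy_c2c_int32) are assumed to be the ones declared in NE10_dsp.h; their exact names may differ between Ne10 releases.

/* Minimal usage sketch (assumed helper names, see note above). */
#include <stdlib.h>
#include "NE10_types.h"
#include "NE10_dsp.h"

static void example_c2c_int32 (void)
{
    const ne10_int32_t nfft = 64;
    ne10_fft_cpx_int32_t *in  = malloc (nfft * sizeof (ne10_fft_cpx_int32_t));
    ne10_fft_cpx_int32_t *out = malloc (nfft * sizeof (ne10_fft_cpx_int32_t));
    ne10_fft_cfg_int32_t cfg  = ne10_fft_alloc_c2c_int32_neon (nfft); /* assumed alloc helper */

    if (!in || !out || !cfg)
    {
        /* Allocation failed; release whatever succeeded. */
        if (cfg)
            ne10_fft_destroy_c2c_int32 (cfg); /* assumed destroy helper */
        free (in);
        free (out);
        return;
    }

    /* Fill 'in' with Q31 fixed-point samples here. */

    /* Forward transform, scaled at each stage to avoid fixed-point overflow. */
    ne10_fft_c2c_1d_int32_neon (out, in, cfg, 0, 1);

    /* Inverse transform back into 'in'. */
    ne10_fft_c2c_1d_int32_neon (in, out, cfg, 1, 1);

    ne10_fft_destroy_c2c_int32 (cfg);
    free (in);
    free (out);
}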