Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_boxfilter.neon.c
1 /*
2  * Copyright 2013-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : imgproc/NE10_boxfilter.neon.c
30  */
31 
32 #include "NE10.h"
33 #include <stdlib.h>
34 #include <math.h>
35 #include <arm_neon.h>
40 extern void ne10_img_boxfilter_row_border (const ne10_uint8_t* src,
41  ne10_uint8_t* dst,
42  ne10_size_t src_sz,
43  ne10_int32_t src_stride,
44  ne10_int32_t dst_stride,
45  ne10_size_t kernel,
46  ne10_point_t anchor,
47  ne10_int32_t *border_l_ptr,
48  ne10_int32_t *border_r_ptr);
49 
50 extern void ne10_img_boxfilter_col_border (const ne10_uint8_t *src,
51  ne10_uint8_t *dst,
52  ne10_size_t src_sz,
53  ne10_int32_t src_stride,
54  ne10_int32_t dst_stride,
55  ne10_size_t kernel,
56  ne10_point_t anchor,
57  ne10_int32_t *border_t_ptr,
58  ne10_int32_t *border_b_ptr);
59 
60 extern void ne10_img_boxfilter_row_c (const ne10_uint8_t *src,
61  ne10_uint8_t *dst,
62  ne10_size_t src_sz,
63  ne10_int32_t src_stride,
64  ne10_int32_t dst_stride,
65  ne10_size_t kernel,
66  ne10_point_t anchor,
67  ne10_int32_t border_l,
68  ne10_int32_t border_r);
69 
70 extern void ne10_img_boxfilter_col_c (const ne10_uint8_t *src,
71  ne10_uint8_t *dst,
72  ne10_size_t src_sz,
73  ne10_int32_t src_stride,
74  ne10_int32_t dst_stride,
75  ne10_size_t kernel,
76  ne10_point_t anchor,
77  ne10_int32_t border_t,
78  ne10_int32_t border_b);
79 
/* Number of interleaved 8-bit channels per RGBA pixel. */
#define RGBA_CH 4
/* Division by the kernel size is replaced by a multiplication with
 * (1 << DIV_SHIFT) / size followed by a right shift of DIV_SHIFT bits.
 */
#define DIV_SHIFT 15
84 
85 void ne10_img_boxfilter_row_neon (const ne10_uint8_t *src,
86  ne10_uint8_t *dst,
87  ne10_size_t src_sz,
88  ne10_int32_t src_stride,
89  ne10_int32_t dst_stride,
90  ne10_size_t kernel,
91  ne10_point_t anchor,
92  ne10_int32_t border_l,
93  ne10_int32_t border_r)
94 {
95  /* when in special cases, we'll call the c version of row filter */
96  if (src_sz.y == 1 || kernel.x >= (1 << 7) || kernel.x == 1)
97  {
98  return ne10_img_boxfilter_row_c (src,
99  dst,
100  src_sz,
101  src_stride,
102  dst_stride,
103  kernel,
104  anchor,
105  border_l,
106  border_r);
107  }
108 
109  assert (src != dst);
110  assert (kernel.x > 0);
111  assert ( (kernel.x <= src_sz.x) && (kernel.y <= src_sz.y));
112  /* if kernel.x was 1, the mul variable would overflow when it
113  * is casted to ne10_int16_t type.
114  */
115  assert ((src_sz.y > 1) &&
116  (kernel.x < (1 << 7)) &&
117  (kernel.x > 1));
118 
119  ne10_int32_t x, y, k;
120  ne10_int16_t mul = (1 << DIV_SHIFT) / kernel.x;
121  int16x8_t mul_vec = vdupq_n_s16 (mul);
122 
123  for (y = 0; y < src_sz.y; y += 2)
124  {
125  /* step back one row when image height is odd and before reaching last
126  * line
127  */
128  if ((src_sz.y % 2 != 0) && (y == src_sz.y - 1))
129  y--;
130 
131  const ne10_uint8_t *src_row1 = src + y * src_stride;
132  const ne10_uint8_t *src_row2 = src + (y + 1) * src_stride;
133  ne10_uint8_t *dst_row1 = dst + y * dst_stride;
134  ne10_uint8_t *dst_row2 = dst + (y + 1) * dst_stride;
135  ne10_int16_t sum[RGBA_CH * 2];
136 
137  for (k = 0; k < RGBA_CH; k++)
138  {
139  sum[k] = 0;
140  sum[k + 4] = 0;
141 
142  for (x = 0; x < kernel.x; x++)
143  {
144  sum[k] += * (src_row1 + x * RGBA_CH + k);
145  sum[k + 4] += * (src_row2 + x * RGBA_CH + k);
146  }
147 
148  *(dst_row1 + border_l * RGBA_CH + k) = sum[k] * mul >>
149  DIV_SHIFT;
150  *(dst_row2 + border_l * RGBA_CH + k) = sum[k + 4] * mul >>
151  DIV_SHIFT;
152  }
153 
154  ne10_uint32_t prev = (anchor.x + 1) * RGBA_CH;
155  ne10_uint32_t next = (kernel.x - anchor.x - 1) * RGBA_CH;
156  const ne10_uint8_t *src_pixel1 = src_row1 + (1 + border_l) * RGBA_CH;
157  const ne10_uint8_t *src_pixel2 = src_row2 + (1 + border_l) * RGBA_CH;
158  const ne10_uint8_t *src_pixel_end = src_row1 + (src_sz.x - border_r) *
159  RGBA_CH;
160  ne10_uint8_t *dst_pixel1 = dst_row1 + (1 + border_l) * RGBA_CH;
161  ne10_uint8_t *dst_pixel2 = dst_row2 + (1 + border_l) * RGBA_CH;
162 
163  int16x8_t sum_vec = vld1q_s16 (sum);
164  int16x8_t sum_tmp;
165  uint16x8_t sum_vec_u;
166  uint8x8_t src_pixel_next_vec, src_pixel_prev_vec;
167  uint32x2_t src_pixel_next_tmp_vec, src_pixel_prev_tmp_vec;
168  uint32x2_t src_pixel_next_tmp_vec_pre, src_pixel_prev_tmp_vec_pre;
169  uint32x2_t dst_pixel_vec;
170  uint8x8_t dst_pixel_tmp_vec;
171 
172 
173  /* preload */
174  src_pixel_next_tmp_vec = vld1_lane_u32 (
175  (const ne10_uint32_t*) (src_pixel1 + next),
176  src_pixel_next_tmp_vec,
177  0);
178  src_pixel_prev_tmp_vec = vld1_lane_u32 (
179  (const ne10_uint32_t*) (src_pixel1 - prev),
180  src_pixel_prev_tmp_vec,
181  0);
182  src_pixel_next_tmp_vec = vld1_lane_u32 (
183  (const ne10_uint32_t*) (src_pixel2 + next),
184  src_pixel_next_tmp_vec,
185  1);
186  src_pixel_prev_tmp_vec = vld1_lane_u32 (
187  (const ne10_uint32_t*) (src_pixel2 - prev),
188  src_pixel_prev_tmp_vec,
189  1);
190 
191  /* load two rows to do filtering */
192  while (src_pixel1 < src_pixel_end)
193  {
194  /* preload */
195  src_pixel_next_tmp_vec_pre = vld1_lane_u32 (
196  (const ne10_uint32_t*) (src_pixel1 + 4 + next),
197  src_pixel_next_tmp_vec_pre,
198  0);
199  src_pixel_prev_tmp_vec_pre = vld1_lane_u32 (
200  (const ne10_uint32_t*) (src_pixel1 + 4 - prev),
201  src_pixel_prev_tmp_vec_pre,
202  0);
203  src_pixel_next_tmp_vec_pre = vld1_lane_u32 (
204  (const ne10_uint32_t*) (src_pixel2 + 4 + next),
205  src_pixel_next_tmp_vec_pre,
206  1);
207  src_pixel_prev_tmp_vec_pre = vld1_lane_u32 (
208  (const ne10_uint32_t*) (src_pixel2 + 4 - prev),
209  src_pixel_prev_tmp_vec_pre,
210  1);
211 
212  src_pixel_prev_vec = vreinterpret_u8_u32 (src_pixel_prev_tmp_vec);
213  src_pixel_next_vec = vreinterpret_u8_u32 (src_pixel_next_tmp_vec);
214 
215  sum_vec_u = vreinterpretq_u16_s16 (sum_vec);
216  sum_vec_u = vaddw_u8 (sum_vec_u, src_pixel_next_vec);
217  sum_vec_u = vsubw_u8 (sum_vec_u, src_pixel_prev_vec);
218  sum_vec = vreinterpretq_s16_u16 (sum_vec_u);
219  /* vqdmulhq_n_s16 would shift the result 16 bit */
220  sum_tmp = vqdmulhq_s16 (sum_vec, mul_vec);
221  dst_pixel_tmp_vec = vqmovun_s16 (sum_tmp);
222  dst_pixel_vec = vreinterpret_u32_u8 (dst_pixel_tmp_vec);
223  vst1_lane_u32 ((ne10_uint32_t *) dst_pixel1, dst_pixel_vec, 0);
224  vst1_lane_u32 ((ne10_uint32_t *) dst_pixel2, dst_pixel_vec, 1);
225 
226  src_pixel_prev_tmp_vec = src_pixel_prev_tmp_vec_pre;
227  src_pixel_next_tmp_vec = src_pixel_next_tmp_vec_pre;
228 
229  src_pixel1 += 4;
230  src_pixel2 += 4;
231  dst_pixel1 += 4;
232  dst_pixel2 += 4;
233  }
234  }
235 }
236 
237 void ne10_img_boxfilter_col_neon (const ne10_uint8_t *src,
238  ne10_uint8_t *dst,
239  ne10_size_t src_sz,
240  ne10_int32_t src_stride,
241  ne10_int32_t dst_stride,
242  ne10_size_t kernel,
243  ne10_point_t anchor,
244  ne10_int32_t border_t,
245  ne10_int32_t border_b)
246 {
247  /* when in special cases, we'll call c version to do the work */
248  if (kernel.y == 1 || kernel.y >= (1 << 7) || src_sz.x == 1)
249  {
250  return ne10_img_boxfilter_col_c (src,
251  dst,
252  src_sz,
253  src_stride,
254  dst_stride,
255  kernel,
256  anchor,
257  border_t,
258  border_b);
259  }
260 
261  assert (src != dst);
262  assert (kernel.y > 0);
263  assert ( (kernel.x <= src_sz.x) && (kernel.y <= src_sz.y));
264  /* if kernel.y was 1, the mul variable would overflow when it
265  * is casted to ne10_int16_t type.
266  */
267  assert ( (src_sz.x > 1) &&
268  (kernel.y < (1 << 7)) &&
269  (kernel.y > 1));
270 
271  ne10_int32_t x, y, k;
272  ne10_uint16_t *sum_row = (ne10_uint16_t *) malloc (src_sz.x *
273  RGBA_CH *
274  sizeof (ne10_uint16_t));
275  ne10_uint16_t mul = (1 << DIV_SHIFT) / kernel.y;
276 
277  if (!sum_row)
278  {
279  fprintf (stderr,
280  "ERROR: buffer allocation fails!\nallocation size: %d\n",
281  sizeof (ne10_uint32_t) *
282  src_sz.x *
283  RGBA_CH);
284  return;
285  }
286 
287  for (x = 0; x < src_sz.x * RGBA_CH; x++)
288  {
289  sum_row[x] = 0;
290  }
291 
292  for (x = 0; x < src_sz.x; x++)
293  {
294  const ne10_uint8_t *src_col = src + x * RGBA_CH;
295  ne10_uint8_t *dst_col = dst + x * RGBA_CH;
296  ne10_uint8_t *dst_pixel = dst_col + border_t * dst_stride;
297  ne10_uint16_t *sum = sum_row + x * RGBA_CH;
298 
299  for (y = 0; y < kernel.y; y++)
300  {
301  const ne10_uint8_t *src_pixel = src_col + y * src_stride;
302 
303  for (k = 0; k < RGBA_CH; k++)
304  {
305  sum[k] += src_pixel[k];
306  }
307  }
308 
309  for (k = 0; k < RGBA_CH; k++)
310  {
311  dst_pixel[k] = sum_row[x * RGBA_CH + k] * mul >>
312  DIV_SHIFT;
313  }
314  }
315 
316  const ne10_uint8_t *src_row = src + (1 + border_t) * src_stride;
317  const ne10_uint8_t *src_row_end = src + (src_sz.y - border_b) *
318  src_stride;
319  ne10_uint8_t *dst_row = dst + (1 + border_t) * dst_stride;
320  ne10_uint32_t prev = (anchor.y + 1) * src_stride;
321  ne10_uint32_t next = (kernel.y - anchor.y - 1) * src_stride;
322 
323  uint16x8_t sum_vec, sum_vec_pre;
324  int16x8_t sum_vec_s;
325  uint8x8_t src_pixel_prev_vec, src_pixel_next_vec;
326  uint8x8_t src_pixel_prev_vec_pre, src_pixel_next_vec_pre;
327  uint8x8_t dst_pixel_vec;
328 
329  ne10_uint16_t sum_val_bakcup[RGBA_CH];
330  ne10_uint32_t src_sz_x_adjust = src_sz.x;
331  int16x8_t mul_vec = vdupq_n_s16 (mul);
332 
333  if (src_sz.x % 2 != 0)
334  {
335  for (k = 0; k < RGBA_CH; k++)
336  sum_val_bakcup[k] = sum_row[ (src_sz.x - 2) * RGBA_CH + k];
337  src_sz_x_adjust = src_sz.x - 1;
338  }
339 
340  /* boxfilter column filter is done once in a row, which
341  * is more friendly to cache than once in a column.
342  */
343  while (src_row < src_row_end)
344  {
345  /* preload */
346  const ne10_uint8_t *src_pixel = src_row;
347  ne10_uint8_t *dst_pixel = dst_row;
348  src_pixel_prev_vec = vld1_u8 (src_pixel - prev);
349  src_pixel_next_vec = vld1_u8 (src_pixel + next);
350  ne10_uint16_t *sum, *sum_pre;
351  sum_vec = vld1q_u16 (sum_row);
352 
353  for (x = 0; x < src_sz_x_adjust; x += 2)
354  {
355  sum = sum_row + x * RGBA_CH;
356  sum_pre = sum + 2 * RGBA_CH;
357  sum_vec_pre = vld1q_u16 (sum_pre);
358  /* preload */
359  src_pixel = src_row + (x + 2) * RGBA_CH;
360  src_pixel_prev_vec_pre = vld1_u8 (src_pixel - prev);
361  src_pixel_next_vec_pre = vld1_u8 (src_pixel + next);
362 
363  sum_vec = vaddw_u8 (sum_vec, src_pixel_next_vec);
364  sum_vec = vsubw_u8 (sum_vec, src_pixel_prev_vec);
365  sum_vec_s = vreinterpretq_s16_u16 (sum_vec);
366  /* vqdmulhq_n_s16 would shift the result 16 bit */
367  sum_vec_s = vqdmulhq_s16 (sum_vec_s, mul_vec);
368  dst_pixel_vec = vqmovun_s16 (sum_vec_s);
369  dst_pixel = dst_row + x * RGBA_CH;
370  vst1_u8 (dst_pixel, dst_pixel_vec);
371  vst1q_u16 (sum, sum_vec);
372 
373  src_pixel_next_vec = src_pixel_next_vec_pre;
374  src_pixel_prev_vec = src_pixel_prev_vec_pre;
375  sum_vec = sum_vec_pre;
376  }
377  src_row += src_stride;
378  dst_row += dst_stride;
379  }
380 
381  if (src_sz.x % 2 != 0)
382  {
383  for (k = 0; k < RGBA_CH; k++)
384  sum_row[ (src_sz.x - 2) * RGBA_CH + k] = sum_val_bakcup[k];
385 
386  src_row = src + (1 + border_t) * src_stride;
387  dst_row = dst + (1 + border_t) * dst_stride;
388  /* step back one column */
389  x = src_sz.x - 2;
390  sum_vec = vld1q_u16 (sum_row + x * RGBA_CH);
391 
392  while (src_row < src_row_end)
393  {
394  const ne10_uint8_t *src_pixel = src_row + x * RGBA_CH;
395  ne10_uint8_t *dst_pixel = dst_row + x * RGBA_CH;
396  src_pixel_prev_vec = vld1_u8 (src_pixel - prev);
397  src_pixel_next_vec = vld1_u8 (src_pixel + next);
398  sum_vec = vaddw_u8 (sum_vec, src_pixel_next_vec);
399  sum_vec = vsubw_u8 (sum_vec, src_pixel_prev_vec);
400  sum_vec_s = vreinterpretq_s16_u16 (sum_vec);
401  /* vqdmulhq_n_s16 would shift the result 16 bit
402  */
403  sum_vec_s = vqdmulhq_s16 (sum_vec_s, mul_vec);
404  dst_pixel_vec = vqmovun_s16 (sum_vec_s);
405  vst1_u8 (dst_pixel, dst_pixel_vec);
406 
407  src_row += src_stride;
408  dst_row += dst_stride;
409  }
410  }
411 
412  free (sum_row);
413 }
414 
434 void ne10_img_boxfilter_rgba8888_neon (const ne10_uint8_t *src,
435  ne10_uint8_t *dst,
436  ne10_size_t src_sz,
437  ne10_int32_t src_stride,
438  ne10_int32_t dst_stride,
439  ne10_size_t kernel)
440 {
441  assert (src != 0 && dst != 0);
442  assert (src_sz.x > 0 && src_sz.y > 0);
443  assert (src_stride > 0 && dst_stride > 0);
444  assert (kernel.x > 0 && kernel.x <= src_sz.x
445  && kernel.y > 0 && kernel.y <= src_sz.y);
446 
447  ne10_int32_t border_l, border_r, border_t, border_b;
448  ne10_point_t anchor;
449 
450  anchor.x = kernel.x / 2;
451  anchor.y = kernel.y / 2;
452 
453  /* the extra 2 elements here is reserved for pre-load when do row or column
454  * filter */
455  ne10_uint32_t mem_bytes = (sizeof (ne10_uint8_t) * src_sz.x * src_sz.y + 2)
456  * RGBA_CH;
457 
458  ne10_uint8_t *dst_buf = (ne10_uint8_t *) malloc (mem_bytes);
459 
460  if (!dst_buf)
461  {
462  fprintf (stderr,
463  "ERROR: buffer allocation fails!\nallocation size: %d\n",
464  mem_bytes);
465  return;
466  }
467 
468  ne10_int32_t dst_buf_stride = src_sz.x * RGBA_CH;
469 
470  /* compute the row border of dst image */
471  ne10_img_boxfilter_row_border (src,
472  dst_buf,
473  src_sz,
474  src_stride,
475  dst_buf_stride,
476  kernel,
477  anchor,
478  &border_l,
479  &border_r);
480  /* boxfilter is separable filter, and then can be apply row filter and
481  * column filter sequentially. here apply boxfilter's row part to image
482  */
483  ne10_img_boxfilter_row_neon (src,
484  dst_buf,
485  src_sz,
486  src_stride,
487  dst_buf_stride,
488  kernel,
489  anchor,
490  border_l,
491  border_r);
492 
493  /* compute the column border of dst image,
494  * which is based on previous row filter result.
495  */
496  ne10_img_boxfilter_col_border (dst_buf,
497  dst,
498  src_sz,
499  dst_buf_stride,
500  dst_stride,
501  kernel,
502  anchor,
503  &border_t,
504  &border_b);
505 
506  /* apply boxfilter column filter to image */
507  ne10_img_boxfilter_col_neon (dst_buf,
508  dst,
509  src_sz,
510  dst_buf_stride,
511  dst_stride,
512  kernel,
513  anchor,
514  border_t,
515  border_b);
516 
517  free (dst_buf);
518 }
void ne10_img_boxfilter_rgba8888_neon(const ne10_uint8_t *src, ne10_uint8_t *dst, ne10_size_t src_sz, ne10_int32_t src_stride, ne10_int32_t dst_stride, ne10_size_t kernel)
neon optimized box filter
Structure for point in image.
Definition: NE10_types.h:434