#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>

/* "NE10.h" is assumed to be the umbrella header providing the ne10_* types
 * used throughout this file. */
#include "NE10.h"

/* RGBA8888 pixels have 4 channels. DIV_SHIFT must be 15 to match
 * vqdmulhq_s16 below, which computes (a * b * 2) >> 16 == (a * b) >> 15. */
#define RGBA_CH 4
#define DIV_SHIFT 15

/* Scalar border handlers and row/column filters, implemented in the plain C
 * counterpart of this file. */
extern void ne10_img_boxfilter_row_border (const ne10_uint8_t *src,
    ne10_uint8_t *dst,
    ne10_size_t src_sz,
    ne10_int32_t src_stride,
    ne10_int32_t dst_stride,
    ne10_size_t kernel,
    ne10_point_t anchor,
    ne10_int32_t *border_l_ptr,
    ne10_int32_t *border_r_ptr);

extern void ne10_img_boxfilter_col_border (const ne10_uint8_t *src,
    ne10_uint8_t *dst,
    ne10_size_t src_sz,
    ne10_int32_t src_stride,
    ne10_int32_t dst_stride,
    ne10_size_t kernel,
    ne10_point_t anchor,
    ne10_int32_t *border_t_ptr,
    ne10_int32_t *border_b_ptr);

extern void ne10_img_boxfilter_row_c (const ne10_uint8_t *src,
    ne10_uint8_t *dst,
    ne10_size_t src_sz,
    ne10_int32_t src_stride,
    ne10_int32_t dst_stride,
    ne10_size_t kernel,
    ne10_point_t anchor,
    ne10_int32_t border_l,
    ne10_int32_t border_r);

extern void ne10_img_boxfilter_col_c (const ne10_uint8_t *src,
    ne10_uint8_t *dst,
    ne10_size_t src_sz,
    ne10_int32_t src_stride,
    ne10_int32_t dst_stride,
    ne10_size_t kernel,
    ne10_point_t anchor,
    ne10_int32_t border_t,
    ne10_int32_t border_b);
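
/* Note on conventions, inferred from the code below: src and dst point to
 * RGBA8888 pixel data, src_sz is the image size in pixels, strides are in
 * bytes per row (ne10_img_boxfilter_rgba8888_neon derives its intermediate
 * stride as src_sz.x * RGBA_CH), kernel is the box size in pixels and
 * anchor is the window origin. The *_border helpers fill the edge pixels
 * that the main filters skip and report the border widths through the
 * border_*_ptr out-parameters. */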

void ne10_img_boxfilter_row_neon (const ne10_uint8_t *src,
                                  ne10_uint8_t *dst,
                                  ne10_size_t src_sz,
                                  ne10_int32_t src_stride,
                                  ne10_int32_t dst_stride,
                                  ne10_size_t kernel,
                                  ne10_point_t anchor,
                                  ne10_int32_t border_l,
                                  ne10_int32_t border_r)
{
    /* Fall back to the scalar version when the two-row scheme cannot be
     * used (a single image row), when kernel.x == 1 (mul below would be
     * 1 << 15, which overflows ne10_int16_t), or when kernel.x >= 1 << 7
     * (the window sum of up to 255 * kernel.x must fit in an int16 lane). */
    if (src_sz.y == 1 || kernel.x >= (1 << 7) || kernel.x == 1)
    {
        ne10_img_boxfilter_row_c (src, dst, src_sz, src_stride, dst_stride,
                                  kernel, anchor, border_l, border_r);
        return;
    }

    assert (kernel.x > 0);
    assert ((kernel.x <= src_sz.x) && (kernel.y <= src_sz.y));
    /* These conditions are guaranteed by the early-out above. */
    assert ((src_sz.y > 1) &&
            (kernel.x < (1 << 7)) &&
            (kernel.x > 1));

    ne10_int32_t x, y, k;
    /* Q15 fixed-point reciprocal of the kernel width: dividing each window
     * sum by kernel.x becomes a multiply and a shift. */
    ne10_int16_t mul = (1 << DIV_SHIFT) / kernel.x;
    int16x8_t mul_vec = vdupq_n_s16 (mul);
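    /* A worked example with assumed numbers: for kernel.x = 5,
     * mul = 32768 / 5 = 6553, so a window sum of 500 becomes
     * (500 * 6553) >> 15 = 99, close to the exact 500 / 5 = 100; the
     * truncated reciprocal can undershoot the true average by one. */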

    for (y = 0; y < src_sz.y; y += 2)
    {
        /* Two rows are filtered per iteration. For an image with an odd
         * number of rows, step back one row on the last pass so rows
         * src_sz.y - 2 and src_sz.y - 1 are processed (recomputing one
         * row is harmless). */
        if ((src_sz.y % 2 != 0) && (y == src_sz.y - 1))
        {
            y--;
        }

        const ne10_uint8_t *src_row1 = src + y * src_stride;
        const ne10_uint8_t *src_row2 = src + (y + 1) * src_stride;
        ne10_uint8_t *dst_row1 = dst + y * dst_stride;
        ne10_uint8_t *dst_row2 = dst + (y + 1) * dst_stride;
        ne10_int16_t sum[RGBA_CH * 2];

        /* Seed the sliding sums with the first kernel.x pixels and write
         * the first output pixel of each row. sum[0..3] holds row 1's four
         * channel sums, sum[4..7] holds row 2's. */
        for (k = 0; k < RGBA_CH; k++)
        {
            sum[k] = 0;
            sum[k + 4] = 0;

            for (x = 0; x < kernel.x; x++)
            {
                sum[k] += * (src_row1 + x * RGBA_CH + k);
                sum[k + 4] += * (src_row2 + x * RGBA_CH + k);
            }

            *(dst_row1 + border_l * RGBA_CH + k) = sum[k] * mul >>
                DIV_SHIFT;
            *(dst_row2 + border_l * RGBA_CH + k) = sum[k + 4] * mul >>
                DIV_SHIFT;
        }

        /* Byte offsets of the pixel leaving the window (prev) and the
         * pixel entering it (next) as the window slides right. */
        ne10_uint32_t prev = (anchor.x + 1) * RGBA_CH;
        ne10_uint32_t next = (kernel.x - anchor.x - 1) * RGBA_CH;
        const ne10_uint8_t *src_pixel1 = src_row1 + (1 + border_l) * RGBA_CH;
        const ne10_uint8_t *src_pixel2 = src_row2 + (1 + border_l) * RGBA_CH;
        const ne10_uint8_t *src_pixel_end = src_row1 + (src_sz.x - border_r) *
            RGBA_CH;
        ne10_uint8_t *dst_pixel1 = dst_row1 + (1 + border_l) * RGBA_CH;
        ne10_uint8_t *dst_pixel2 = dst_row2 + (1 + border_l) * RGBA_CH;

        int16x8_t sum_vec = vld1q_s16 (sum);
        int16x8_t sum_tmp;
        uint16x8_t sum_vec_u;
        uint8x8_t src_pixel_next_vec, src_pixel_prev_vec;
        /* Zero-initialised because vld1_lane_u32 overwrites only one lane. */
        uint32x2_t src_pixel_next_tmp_vec = vdup_n_u32 (0);
        uint32x2_t src_pixel_prev_tmp_vec = vdup_n_u32 (0);
        uint32x2_t src_pixel_next_tmp_vec_pre = vdup_n_u32 (0);
        uint32x2_t src_pixel_prev_tmp_vec_pre = vdup_n_u32 (0);
        uint32x2_t dst_pixel_vec;
        uint8x8_t dst_pixel_tmp_vec;

        /* Load the entering (next) and leaving (prev) pixels of both rows:
         * lane 0 carries row 1, lane 1 carries row 2. */
        src_pixel_next_tmp_vec = vld1_lane_u32 (
            (const ne10_uint32_t*) (src_pixel1 + next),
            src_pixel_next_tmp_vec,
            0);
        src_pixel_prev_tmp_vec = vld1_lane_u32 (
            (const ne10_uint32_t*) (src_pixel1 - prev),
            src_pixel_prev_tmp_vec,
            0);
        src_pixel_next_tmp_vec = vld1_lane_u32 (
            (const ne10_uint32_t*) (src_pixel2 + next),
            src_pixel_next_tmp_vec,
            1);
        src_pixel_prev_tmp_vec = vld1_lane_u32 (
            (const ne10_uint32_t*) (src_pixel2 - prev),
            src_pixel_prev_tmp_vec,
            1);

        while (src_pixel1 < src_pixel_end)
        {
            /* Preload the next iteration's entering/leaving pixels
             * (4 bytes, i.e. one RGBA pixel, ahead) to hide load latency. */
            src_pixel_next_tmp_vec_pre = vld1_lane_u32 (
                (const ne10_uint32_t*) (src_pixel1 + 4 + next),
                src_pixel_next_tmp_vec_pre,
                0);
            src_pixel_prev_tmp_vec_pre = vld1_lane_u32 (
                (const ne10_uint32_t*) (src_pixel1 + 4 - prev),
                src_pixel_prev_tmp_vec_pre,
                0);
            src_pixel_next_tmp_vec_pre = vld1_lane_u32 (
                (const ne10_uint32_t*) (src_pixel2 + 4 + next),
                src_pixel_next_tmp_vec_pre,
                1);
            src_pixel_prev_tmp_vec_pre = vld1_lane_u32 (
                (const ne10_uint32_t*) (src_pixel2 + 4 - prev),
                src_pixel_prev_tmp_vec_pre,
                1);

            src_pixel_prev_vec = vreinterpret_u8_u32 (src_pixel_prev_tmp_vec);
            src_pixel_next_vec = vreinterpret_u8_u32 (src_pixel_next_tmp_vec);

            /* Slide the window one pixel: widen-add the entering bytes and
             * widen-subtract the leaving ones across all 8 channel sums. */
            sum_vec_u = vreinterpretq_u16_s16 (sum_vec);
            sum_vec_u = vaddw_u8 (sum_vec_u, src_pixel_next_vec);
            sum_vec_u = vsubw_u8 (sum_vec_u, src_pixel_prev_vec);
            sum_vec = vreinterpretq_s16_u16 (sum_vec_u);

            /* Divide by kernel.x via the Q15 reciprocal, narrow with
             * saturation, and store one output pixel per row. */
            sum_tmp = vqdmulhq_s16 (sum_vec, mul_vec);
            dst_pixel_tmp_vec = vqmovun_s16 (sum_tmp);
            dst_pixel_vec = vreinterpret_u32_u8 (dst_pixel_tmp_vec);
            vst1_lane_u32 ((ne10_uint32_t *) dst_pixel1, dst_pixel_vec, 0);
            vst1_lane_u32 ((ne10_uint32_t *) dst_pixel2, dst_pixel_vec, 1);

            src_pixel_prev_tmp_vec = src_pixel_prev_tmp_vec_pre;
            src_pixel_next_tmp_vec = src_pixel_next_tmp_vec_pre;

            src_pixel1 += RGBA_CH;
            src_pixel2 += RGBA_CH;
            dst_pixel1 += RGBA_CH;
            dst_pixel2 += RGBA_CH;
        }
    }
}

void ne10_img_boxfilter_col_neon (const ne10_uint8_t *src,
                                  ne10_uint8_t *dst,
                                  ne10_size_t src_sz,
                                  ne10_int32_t src_stride,
                                  ne10_int32_t dst_stride,
                                  ne10_size_t kernel,
                                  ne10_point_t anchor,
                                  ne10_int32_t border_t,
                                  ne10_int32_t border_b)
{
    /* Fall back to the scalar version when kernel.y == 1 (mul below would
     * overflow ne10_int16_t), when kernel.y >= 1 << 7 (the window sum must
     * fit in a 16-bit lane), or when the image is a single column. */
    if (kernel.y == 1 || kernel.y >= (1 << 7) || src_sz.x == 1)
    {
        ne10_img_boxfilter_col_c (src, dst, src_sz, src_stride, dst_stride,
                                  kernel, anchor, border_t, border_b);
        return;
    }

    assert (kernel.y > 0);
    assert ((kernel.x <= src_sz.x) && (kernel.y <= src_sz.y));
    /* These conditions are guaranteed by the early-out above. */
    assert ((src_sz.x > 1) &&
            (kernel.y < (1 << 7)) &&
            (kernel.y > 1));

    ne10_int32_t x, y, k;
    /* One running sum per channel and per column of the image. */
    ne10_uint16_t *sum_row = (ne10_uint16_t *) malloc (src_sz.x *
                             RGBA_CH *
                             sizeof (ne10_uint16_t));
    ne10_uint16_t mul = (1 << DIV_SHIFT) / kernel.y;

    if (!sum_row)
    {
        fprintf (stderr,
                 "ERROR: buffer allocation fails!\nallocation size: %d\n",
                 (ne10_int32_t) (sizeof (ne10_uint16_t) *
                                 src_sz.x * RGBA_CH));
        return;
    }

    /* Zero all per-column channel sums. */
    for (x = 0; x < src_sz.x * RGBA_CH; x++)
        sum_row[x] = 0;

    /* Seed each column's sliding sum with its first kernel.y pixels and
     * write the first output row. */
    for (x = 0; x < src_sz.x; x++)
    {
        const ne10_uint8_t *src_col = src + x * RGBA_CH;
        ne10_uint8_t *dst_col = dst + x * RGBA_CH;
        ne10_uint8_t *dst_pixel = dst_col + border_t * dst_stride;
        ne10_uint16_t *sum = sum_row + x * RGBA_CH;

        for (y = 0; y < kernel.y; y++)
        {
            const ne10_uint8_t *src_pixel = src_col + y * src_stride;

            for (k = 0; k < RGBA_CH; k++)
            {
                sum[k] += src_pixel[k];
            }
        }

        for (k = 0; k < RGBA_CH; k++)
        {
            dst_pixel[k] = sum_row[x * RGBA_CH + k] * mul >>
                DIV_SHIFT;
        }
    }

    const ne10_uint8_t *src_row = src + (1 + border_t) * src_stride;
    const ne10_uint8_t *src_row_end = src + (src_sz.y - border_b) *
        src_stride;
    ne10_uint8_t *dst_row = dst + (1 + border_t) * dst_stride;
    /* Byte offsets of the row leaving the window (prev) and the row
     * entering it (next) as the window slides down. */
    ne10_uint32_t prev = (anchor.y + 1) * src_stride;
    ne10_uint32_t next = (kernel.y - anchor.y - 1) * src_stride;

    uint16x8_t sum_vec, sum_vec_pre;
    int16x8_t sum_vec_s;
    uint8x8_t src_pixel_prev_vec, src_pixel_next_vec;
    uint8x8_t src_pixel_prev_vec_pre, src_pixel_next_vec_pre;
    uint8x8_t dst_pixel_vec;

    ne10_uint16_t sum_val_backup[RGBA_CH];
    ne10_uint32_t src_sz_x_adjust = src_sz.x;
    int16x8_t mul_vec = vdupq_n_s16 (mul);

    /* The vector loop below handles two columns (eight channel sums) per
     * step. For an odd width, stop one column early and back up the sums
     * of column src_sz.x - 2, which the vector loop overwrites; the tail
     * pass afterwards reprocesses the last two columns. */
    if (src_sz.x % 2 != 0)
    {
        for (k = 0; k < RGBA_CH; k++)
            sum_val_backup[k] = sum_row[ (src_sz.x - 2) * RGBA_CH + k];
        src_sz_x_adjust = src_sz.x - 1;
    }

    while (src_row < src_row_end)
    {
        const ne10_uint8_t *src_pixel = src_row;
        ne10_uint8_t *dst_pixel = dst_row;
        src_pixel_prev_vec = vld1_u8 (src_pixel - prev);
        src_pixel_next_vec = vld1_u8 (src_pixel + next);
        ne10_uint16_t *sum, *sum_pre;
        sum_vec = vld1q_u16 (sum_row);

        for (x = 0; x < src_sz_x_adjust; x += 2)
        {
            sum = sum_row + x * RGBA_CH;
            sum_pre = sum + 2 * RGBA_CH;
            sum_vec_pre = vld1q_u16 (sum_pre);

            /* Preload the next pair of columns. */
            src_pixel = src_row + (x + 2) * RGBA_CH;
            src_pixel_prev_vec_pre = vld1_u8 (src_pixel - prev);
            src_pixel_next_vec_pre = vld1_u8 (src_pixel + next);

            /* Slide the window one row down: add the entering pixels and
             * subtract the leaving ones. */
            sum_vec = vaddw_u8 (sum_vec, src_pixel_next_vec);
            sum_vec = vsubw_u8 (sum_vec, src_pixel_prev_vec);
            sum_vec_s = vreinterpretq_s16_u16 (sum_vec);

            /* Divide by kernel.y via the Q15 reciprocal, store two output
             * pixels, and keep the updated sums for the next row. */
            sum_vec_s = vqdmulhq_s16 (sum_vec_s, mul_vec);
            dst_pixel_vec = vqmovun_s16 (sum_vec_s);
            dst_pixel = dst_row + x * RGBA_CH;
            vst1_u8 (dst_pixel, dst_pixel_vec);
            vst1q_u16 (sum, sum_vec);

            src_pixel_next_vec = src_pixel_next_vec_pre;
            src_pixel_prev_vec = src_pixel_prev_vec_pre;
            sum_vec = sum_vec_pre;
        }
        src_row += src_stride;
        dst_row += dst_stride;
    }

    /* Odd width: restore the backed-up sums and filter the last two
     * columns in a single-pair pass. */
    if (src_sz.x % 2 != 0)
    {
        for (k = 0; k < RGBA_CH; k++)
            sum_row[ (src_sz.x - 2) * RGBA_CH + k] = sum_val_backup[k];

        src_row = src + (1 + border_t) * src_stride;
        dst_row = dst + (1 + border_t) * dst_stride;
        x = src_sz.x - 2;
        sum_vec = vld1q_u16 (sum_row + x * RGBA_CH);

        while (src_row < src_row_end)
        {
            const ne10_uint8_t *src_pixel = src_row + x * RGBA_CH;
            ne10_uint8_t *dst_pixel = dst_row + x * RGBA_CH;
            src_pixel_prev_vec = vld1_u8 (src_pixel - prev);
            src_pixel_next_vec = vld1_u8 (src_pixel + next);
            sum_vec = vaddw_u8 (sum_vec, src_pixel_next_vec);
            sum_vec = vsubw_u8 (sum_vec, src_pixel_prev_vec);
            sum_vec_s = vreinterpretq_s16_u16 (sum_vec);

            sum_vec_s = vqdmulhq_s16 (sum_vec_s, mul_vec);
            dst_pixel_vec = vqmovun_s16 (sum_vec_s);
            vst1_u8 (dst_pixel, dst_pixel_vec);

            src_row += src_stride;
            dst_row += dst_stride;
        }
    }

    free (sum_row);
}

/* NEON-optimised RGBA8888 box filter. The filter is separable: rows are
 * filtered into an intermediate buffer, then its columns are filtered
 * into dst. */
void ne10_img_boxfilter_rgba8888_neon (const ne10_uint8_t *src,
                                       ne10_uint8_t *dst,
                                       ne10_size_t src_sz,
                                       ne10_int32_t src_stride,
                                       ne10_int32_t dst_stride,
                                       ne10_size_t kernel)
{
    assert (src != 0 && dst != 0);
    assert (src_sz.x > 0 && src_sz.y > 0);
    assert (src_stride > 0 && dst_stride > 0);
    assert (kernel.x > 0 && kernel.x <= src_sz.x
            && kernel.y > 0 && kernel.y <= src_sz.y);

    ne10_int32_t border_l, border_r, border_t, border_b;
    ne10_point_t anchor;

    anchor.x = kernel.x / 2;
    anchor.y = kernel.y / 2;

    /* Intermediate buffer holding the row-filtered image; two extra pixels
     * are reserved for the preloads that read past the end. */
    ne10_uint32_t mem_bytes = (sizeof (ne10_uint8_t) * src_sz.x * src_sz.y + 2)
        * RGBA_CH;
    ne10_uint8_t *dst_buf = (ne10_uint8_t *) malloc (mem_bytes);
463 "ERROR: buffer allocation fails!\nallocation size: %d\n",
468 ne10_int32_t dst_buf_stride = src_sz.x * RGBA_CH;

    /* Row pass: compute the left/right border pixels, then run the NEON
     * row filter over the interior. */
    ne10_img_boxfilter_row_border (src, dst_buf, src_sz, src_stride,
                                   dst_buf_stride, kernel, anchor,
                                   &border_l, &border_r);
    ne10_img_boxfilter_row_neon (src, dst_buf, src_sz, src_stride,
                                 dst_buf_stride, kernel, anchor,
                                 border_l, border_r);

    /* Column pass over the row-filtered buffer. */
    ne10_img_boxfilter_col_border (dst_buf, dst, src_sz, dst_buf_stride,
                                   dst_stride, kernel, anchor,
                                   &border_t, &border_b);
    ne10_img_boxfilter_col_neon (dst_buf, dst, src_sz, dst_buf_stride,
                                 dst_stride, kernel, anchor,
                                 border_t, border_b);

    free (dst_buf);
}
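
/* A minimal usage sketch, not part of the library. It assumes ne10_size_t
 * is an {x, y} struct (as the member accesses above imply) and that a
 * tightly packed RGBA8888 row has a stride of width * RGBA_CH bytes. The
 * guard macro NE10_BOXFILTER_USAGE_EXAMPLE is hypothetical; define it to
 * compile the example. */
#ifdef NE10_BOXFILTER_USAGE_EXAMPLE
int main (void)
{
    enum { W = 16, H = 16 };
    static ne10_uint8_t src[W * H * RGBA_CH];
    static ne10_uint8_t dst[W * H * RGBA_CH];
    ne10_int32_t i;

    /* Fill the source with an arbitrary pattern. */
    for (i = 0; i < W * H * RGBA_CH; i++)
        src[i] = (ne10_uint8_t) (i & 0xff);

    ne10_size_t src_sz = { .x = W, .y = H };
    ne10_size_t kernel = { .x = 3, .y = 3 }; /* 3x3 box */

    ne10_img_boxfilter_rgba8888_neon (src, dst, src_sz,
                                      W * RGBA_CH, /* src_stride in bytes */
                                      W * RGBA_CH, /* dst_stride in bytes */
                                      kernel);
    return 0;
}
#endif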