28 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
30 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
38 int32_t src_stride_2x = (src_stride << 1);
39 int32_t dst_stride_2x = (dst_stride << 1);
40 int32_t src_stride_4x = (src_stride << 2);
41 int32_t dst_stride_4x = (dst_stride << 2);
42 int32_t src_stride_3x = src_stride_2x + src_stride;
43 int32_t dst_stride_3x = dst_stride_2x + dst_stride;
48 __m128i in0, in1, in2, in3;
53 src3 = __lsx_vldx(
src, src_stride_3x);
55 src4 = __lsx_vld(
src, 0);
58 src7 = __lsx_vldx(
src, src_stride_3x);
66 __lsx_vstelm_d(in0,
dst, 0, 0);
67 __lsx_vstelm_d(in0,
dst + dst_stride, 0, 1);
68 __lsx_vstelm_d(in1,
dst + dst_stride_2x, 0, 0);
69 __lsx_vstelm_d(in1,
dst + dst_stride_3x, 0, 1);
71 __lsx_vstelm_d(in2,
dst, 0, 0);
72 __lsx_vstelm_d(in2,
dst + dst_stride, 0, 1);
73 __lsx_vstelm_d(in3,
dst + dst_stride_2x, 0, 0);
74 __lsx_vstelm_d(in3,
dst + dst_stride_3x, 0, 1);
79 in0 = __lsx_vsllwil_hu_bu(
src0, 6);
80 __lsx_vstelm_d(in0,
dst, 0, 0);
92 int32_t src_stride_2x = (src_stride << 1);
93 int32_t src_stride_4x = (src_stride << 2);
94 int32_t src_stride_3x = src_stride_2x + src_stride;
97 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
99 for (loop_cnt = (
height >> 3); loop_cnt--;) {
102 src3 = __lsx_vldx(
src, src_stride_3x);
103 src += src_stride_4x;
104 src4 = __lsx_vld(
src, 0);
105 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
106 src7 = __lsx_vldx(
src, src_stride_3x);
107 src += src_stride_4x;
111 DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
114 __lsx_vstelm_d(in0,
dst, 0, 0);
115 __lsx_vstelm_w(in0,
dst, 8, 2);
117 __lsx_vstelm_d(in1,
dst, 0, 0);
118 __lsx_vstelm_w(in1,
dst, 8, 2);
120 __lsx_vstelm_d(in2,
dst, 0, 0);
121 __lsx_vstelm_w(in2,
dst, 8, 2);
123 __lsx_vstelm_d(in3,
dst, 0, 0);
124 __lsx_vstelm_w(in3,
dst, 8, 2);
126 __lsx_vstelm_d(in4,
dst, 0, 0);
127 __lsx_vstelm_w(in4,
dst, 8, 2);
129 __lsx_vstelm_d(in5,
dst, 0, 0);
130 __lsx_vstelm_w(in5,
dst, 8, 2);
132 __lsx_vstelm_d(in6,
dst, 0, 0);
133 __lsx_vstelm_w(in6,
dst, 8, 2);
135 __lsx_vstelm_d(in7,
dst, 0, 0);
136 __lsx_vstelm_w(in7,
dst, 8, 2);
141 in0 = __lsx_vsllwil_hu_bu(
src0, 6);
143 __lsx_vstelm_d(in0,
dst, 0, 0);
144 __lsx_vstelm_w(in0,
dst, 8, 2);
153 int32_t src_stride_2x = (src_stride << 1);
154 int32_t dst_stride_x = (dst_stride << 1);
155 int32_t src_stride_4x = (src_stride << 2);
156 int32_t dst_stride_2x = (dst_stride_x << 1);
157 int32_t src_stride_3x = src_stride_2x + src_stride;
158 int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
162 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
164 for (loop_cnt = (
height >> 3); loop_cnt--;) {
168 src3 = __lsx_vldx(
src, src_stride_3x);
169 src += src_stride_4x;
170 src4 = __lsx_vld(
src, 0);
173 src7 = __lsx_vldx(
src, src_stride_3x);
174 src += src_stride_4x;
178 DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
180 __lsx_vst(in0,
dst, 0);
181 __lsx_vstx(in1,
dst, dst_stride_x);
182 __lsx_vstx(in2,
dst, dst_stride_2x);
183 __lsx_vstx(in3,
dst, dst_stride_3x);
184 dst += dst_stride_2x;
185 __lsx_vst(in4,
dst, 0);
186 __lsx_vstx(in5,
dst, dst_stride_x);
187 __lsx_vstx(in6,
dst, dst_stride_2x);
188 __lsx_vstx(in7,
dst, dst_stride_3x);
189 dst += dst_stride_2x;
193 in0 = __lsx_vsllwil_hu_bu(
src0, 6);
194 __lsx_vst(in0,
dst, 0);
205 uint32_t res =
height & 0x07;
206 int32_t src_stride_2x = (src_stride << 1);
207 int32_t dst_stride_x = (dst_stride << 1);
208 int32_t src_stride_4x = (src_stride << 2);
209 int32_t dst_stride_2x = (dst_stride_x << 1);
210 int32_t src_stride_3x = src_stride_2x + src_stride;
211 int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
212 __m128i
zero = __lsx_vldi(0);
214 __m128i in0, in1, in0_r, in1_r, in2_r, in3_r;
216 for (loop_cnt = (
height >> 3); loop_cnt--;) {
219 src3 = __lsx_vldx(
src, src_stride_3x);
220 src += src_stride_4x;
221 src4 = __lsx_vld(
src, 0);
222 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
223 src7 = __lsx_vldx(
src, src_stride_3x);
224 src += src_stride_4x;
227 in0_r, in1_r, in2_r, in3_r);
230 __lsx_vst(in0_r,
dst, 0);
231 __lsx_vstx(in1_r,
dst, dst_stride_x);
232 __lsx_vstx(in2_r,
dst, dst_stride_2x);
233 __lsx_vstx(in3_r,
dst, dst_stride_3x);
234 __lsx_vstelm_d(in0,
dst, 16, 0);
236 __lsx_vstelm_d(in0,
dst, 16, 1);
238 __lsx_vstelm_d(in1,
dst, 16, 0);
240 __lsx_vstelm_d(in1,
dst, 16, 1);
243 DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
244 in0_r, in1_r, in2_r, in3_r);
247 __lsx_vst(in0_r,
dst, 0);
248 __lsx_vstx(in1_r,
dst, dst_stride_x);
249 __lsx_vstx(in2_r,
dst, dst_stride_2x);
250 __lsx_vstx(in3_r,
dst, dst_stride_3x);
251 __lsx_vstelm_d(in0,
dst, 16, 0);
253 __lsx_vstelm_d(in0,
dst, 16, 1);
255 __lsx_vstelm_d(in1,
dst, 16, 0);
257 __lsx_vstelm_d(in1,
dst, 16, 1);
262 in0 = __lsx_vsllwil_hu_bu(
src0, 6);
264 in1 = __lsx_vslli_h(
src1, 6);
265 __lsx_vst(in0,
dst, 0);
266 __lsx_vstelm_d(in1,
dst, 16, 0);
276 __m128i
zero = __lsx_vldi(0);
277 int32_t src_stride_2x = (src_stride << 1);
278 int32_t dst_stride_x = (dst_stride << 1);
279 int32_t src_stride_4x = (src_stride << 2);
280 int32_t dst_stride_2x = (dst_stride << 2);
281 int32_t src_stride_3x = src_stride_2x + src_stride;
282 int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
285 int16_t* dst1 =
dst + 8;
287 __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
289 for (loop_cnt = (
height >> 3); loop_cnt--;) {
293 src3 = __lsx_vldx(
src, src_stride_3x);
294 src += src_stride_4x;
295 src4 = __lsx_vld(
src, 0);
298 src7 = __lsx_vldx(
src, src_stride_3x);
299 src += src_stride_4x;
301 in0_l, in1_l, in2_l, in3_l);
303 in0_r, in1_r, in2_r, in3_r);
304 DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
305 in1_l, in2_l, in3_l);
306 __lsx_vst(in0_r,
dst, 0);
307 __lsx_vstx(in1_r,
dst, dst_stride_x);
308 __lsx_vstx(in2_r,
dst, dst_stride_2x);
309 __lsx_vstx(in3_r,
dst, dst_stride_3x);
310 __lsx_vst(in0_l, dst1, 0);
311 __lsx_vstx(in1_l, dst1, dst_stride_x);
312 __lsx_vstx(in2_l, dst1, dst_stride_2x);
313 __lsx_vstx(in3_l, dst1, dst_stride_3x);
314 dst += dst_stride_2x;
315 dst1 += dst_stride_2x;
318 in0_l, in1_l, in2_l, in3_l);
319 DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
320 in0_r, in1_r, in2_r, in3_r);
321 DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
322 in1_l, in2_l, in3_l);
323 __lsx_vst(in0_r,
dst, 0);
324 __lsx_vstx(in1_r,
dst, dst_stride_x);
325 __lsx_vstx(in2_r,
dst, dst_stride_2x);
326 __lsx_vstx(in3_r,
dst, dst_stride_3x);
327 __lsx_vst(in0_l, dst1, 0);
328 __lsx_vstx(in1_l, dst1, dst_stride_x);
329 __lsx_vstx(in2_l, dst1, dst_stride_2x);
330 __lsx_vstx(in3_l, dst1, dst_stride_3x);
331 dst += dst_stride_2x;
332 dst1 += dst_stride_2x;
337 src3 = __lsx_vldx(
src, src_stride_3x);
340 in0_l, in1_l, in2_l, in3_l);
342 in0_r, in1_r, in2_r, in3_r);
343 DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
344 in1_l, in2_l, in3_l);
345 __lsx_vst(in0_r,
dst, 0);
346 __lsx_vstx(in1_r,
dst, dst_stride_x);
347 __lsx_vstx(in2_r,
dst, dst_stride_2x);
348 __lsx_vstx(in3_r,
dst, dst_stride_3x);
350 __lsx_vst(in0_l,
dst, 0);
351 __lsx_vstx(in1_l,
dst, dst_stride_x);
352 __lsx_vstx(in2_l,
dst, dst_stride_2x);
353 __lsx_vstx(in3_l,
dst, dst_stride_3x);
362 int32_t src_stride_2x = (src_stride << 1);
363 int32_t dst_stride_x = (dst_stride << 1);
364 int32_t src_stride_4x = (src_stride << 2);
365 int32_t dst_stride_2x = (dst_stride << 2);
366 int32_t src_stride_3x = src_stride_2x + src_stride;
367 int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
368 const uint8_t *
_src =
src + 16;
370 __m128i
zero = __lsx_vldi(0);
372 __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
374 for (loop_cnt = (
height >> 2); loop_cnt--;) {
377 src3 = __lsx_vldx(
src, src_stride_3x);
378 src += src_stride_4x;
379 src4 = __lsx_vld(
_src, 0);
382 src7 = __lsx_vldx(
_src, src_stride_3x);
383 _src += src_stride_4x;
386 src3, in0_l, in1_l, in2_l, in3_l);
388 in0_r, in1_r, in2_r, in3_r);
389 DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
390 in0_l, in1_l, in2_l, in3_l);
391 __lsx_vst(in0_r,
dst, 0);
392 __lsx_vstx(in1_r,
dst, dst_stride_x);
393 __lsx_vstx(in2_r,
dst, dst_stride_2x);
394 __lsx_vstx(in3_r,
dst, dst_stride_3x);
396 __lsx_vst(in0_l, dst1, 0);
397 __lsx_vstx(in1_l, dst1, dst_stride_x);
398 __lsx_vstx(in2_l, dst1, dst_stride_2x);
399 __lsx_vstx(in3_l, dst1, dst_stride_3x);
400 DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
401 in0_r, in1_r, in2_r, in3_r);
403 __lsx_vst(in0_r, dst1, 0);
404 __lsx_vstx(in1_r, dst1, dst_stride_x);
405 __lsx_vstx(in2_r, dst1, dst_stride_2x);
406 __lsx_vstx(in3_r, dst1, dst_stride_3x);
407 dst += dst_stride_2x;
416 int32_t src_stride_2x = (src_stride << 1);
417 int32_t src_stride_4x = (src_stride << 2);
418 int32_t src_stride_3x = src_stride_2x + src_stride;
419 const uint8_t *
_src =
src + 16;
422 __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
424 for (loop_cnt = (
height >> 2); loop_cnt--;) {
427 src6 = __lsx_vldx(
src, src_stride_3x);
428 src += src_stride_4x;
432 src7 = __lsx_vldx(
_src, src_stride_3x);
433 _src += src_stride_4x;
436 src3, in0_l, in1_l, in2_l, in3_l);
438 in0_r, in1_r, in2_r, in3_r);
439 DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
440 in0_l, in1_l, in2_l, in3_l);
441 __lsx_vst(in0_r,
dst, 0);
442 __lsx_vst(in0_l,
dst, 16);
443 __lsx_vst(in1_r,
dst, 32);
444 __lsx_vst(in1_l,
dst, 48);
446 __lsx_vst(in2_r,
dst, 0);
447 __lsx_vst(in2_l,
dst, 16);
448 __lsx_vst(in3_r,
dst, 32);
449 __lsx_vst(in3_l,
dst, 48);
453 in0_l, in1_l, in2_l, in3_l);
454 DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
455 in0_r, in1_r, in2_r, in3_r);
456 DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
457 in1_l, in2_l, in3_l);
458 __lsx_vst(in0_r,
dst, 0);
459 __lsx_vst(in0_l,
dst, 16);
460 __lsx_vst(in1_r,
dst, 32);
461 __lsx_vst(in1_l,
dst, 48);
463 __lsx_vst(in2_r,
dst, 0);
464 __lsx_vst(in2_l,
dst, 16);
465 __lsx_vst(in3_r,
dst, 32);
466 __lsx_vst(in3_l,
dst, 48);
478 __m128i src8, src9, src10, src11;
479 __m128i in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
480 __m128i in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;
482 for (loop_cnt = (
height >> 2); loop_cnt--;) {
487 src5 = __lsx_vld(
src, 32);
490 src8 = __lsx_vld(
src, 32);
493 src11 = __lsx_vld(
src, 32);
497 src3, in0_l, in1_l, in2_l, in3_l);
500 in0_r, in1_r, in2_r, in3_r);
501 DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
502 in1_l, in2_l, in3_l);
503 DUP2_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, in4_r, in5_r);
504 DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l);
505 __lsx_vst(in0_r,
dst, 0);
506 __lsx_vst(in0_l,
dst, 16);
507 __lsx_vst(in1_r,
dst, 32);
508 __lsx_vst(in1_l,
dst, 48);
509 __lsx_vst(in2_r,
dst, 64);
510 __lsx_vst(in2_l,
dst, 80);
512 __lsx_vst(in3_r,
dst, 0);
513 __lsx_vst(in3_l,
dst, 16);
514 __lsx_vst(in4_r,
dst, 32);
515 __lsx_vst(in4_l,
dst, 48);
516 __lsx_vst(in5_r,
dst, 64);
517 __lsx_vst(in5_l,
dst, 80);
521 in0_l, in1_l, in2_l, in3_l);
523 DUP4_ARG2(__lsx_vsllwil_hu_bu, src6, 6, src7, 6, src8, 6, src9, 6,
524 in0_r, in1_r, in2_r, in3_r);
525 DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
526 in1_l, in2_l, in3_l);
527 DUP2_ARG2(__lsx_vsllwil_hu_bu, src10, 6, src11, 6, in4_r, in5_r);
528 DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l);
529 __lsx_vst(in0_r,
dst, 0);
530 __lsx_vst(in0_l,
dst, 16);
531 __lsx_vst(in1_r,
dst, 32);
532 __lsx_vst(in1_l,
dst, 48);
533 __lsx_vst(in2_r,
dst, 64);
534 __lsx_vst(in2_l,
dst, 80);
536 __lsx_vst(in3_r,
dst, 0);
537 __lsx_vst(in3_l,
dst, 16);
538 __lsx_vst(in4_r,
dst, 32);
539 __lsx_vst(in4_l,
dst, 48);
540 __lsx_vst(in5_r,
dst, 64);
541 __lsx_vst(in5_l,
dst, 80);
553 __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
556 for (loop_cnt = (
height >> 1); loop_cnt--;) {
561 src4, src5, src6, src7);
565 src3, in0_l, in1_l, in2_l, in3_l);
567 in0_r, in1_r, in2_r, in3_r);
568 DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
569 in0_l, in1_l, in2_l, in3_l);
570 __lsx_vst(in0_r,
dst, 0);
571 __lsx_vst(in0_l,
dst, 16);
572 __lsx_vst(in1_r,
dst, 32);
573 __lsx_vst(in1_l,
dst, 48);
574 __lsx_vst(in2_r,
dst, 64);
575 __lsx_vst(in2_l,
dst, 80);
576 __lsx_vst(in3_r,
dst, 96);
577 __lsx_vst(in3_l,
dst, 112);
581 src7, in0_l, in1_l, in2_l, in3_l);
582 DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
583 in0_r, in1_r, in2_r, in3_r);
584 DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
585 in1_l, in2_l, in3_l);
586 __lsx_vst(in0_r,
dst, 0);
587 __lsx_vst(in0_l,
dst, 16);
588 __lsx_vst(in1_r,
dst, 32);
589 __lsx_vst(in1_l,
dst, 48);
590 __lsx_vst(in2_r,
dst, 64);
591 __lsx_vst(in2_l,
dst, 80);
592 __lsx_vst(in3_r,
dst, 96);
593 __lsx_vst(in3_l,
dst, 112);
602 uint32_t loop_cnt =
height >> 3;
603 uint32_t res = (
height & 0x7) >> 1;
604 int32_t src_stride_2x = (src_stride << 1);
605 int32_t dst_stride_2x = (dst_stride << 1);
606 int32_t src_stride_4x = (src_stride << 2);
607 int32_t dst_stride_4x = (dst_stride << 2);
608 int32_t src_stride_3x = src_stride_2x + src_stride;
609 int32_t dst_stride_3x = dst_stride_2x + dst_stride;
611 __m128i filt0, filt1, filt2, filt3;
612 __m128i mask1, mask2, mask3;
613 __m128i vec0, vec1, vec2, vec3;
614 __m128i dst0, dst1, dst2, dst3;
619 filt0, filt1, filt2, filt3);
621 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
622 mask3 = __lsx_vaddi_bu(mask0, 6);
627 src3 = __lsx_vldx(
src, src_stride_3x);
628 src += src_stride_4x;
629 src4 = __lsx_vld(
src, 0);
630 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
631 src7 = __lsx_vldx(
src, src_stride_3x);
632 src += src_stride_4x;
634 src0, mask2,
src1,
src0, mask3, vec0, vec1, vec2, vec3);
635 dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
636 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
638 dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
640 src2, mask2, src3,
src2, mask3, vec0, vec1, vec2, vec3);
641 dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
642 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
644 dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
645 DUP4_ARG3(__lsx_vshuf_b, src5, src4, mask0, src5, src4, mask1, src5,
646 src4, mask2, src5, src4, mask3, vec0, vec1, vec2, vec3);
647 dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
648 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
650 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
651 DUP4_ARG3(__lsx_vshuf_b, src7, src6, mask0, src7, src6, mask1, src7,
652 src6, mask2, src7, src6, mask3, vec0, vec1, vec2, vec3);
653 dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
654 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
656 dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
658 __lsx_vstelm_d(dst0,
dst, 0, 0);
659 __lsx_vstelm_d(dst0,
dst + dst_stride, 0, 1);
660 __lsx_vstelm_d(dst1,
dst + dst_stride_2x, 0, 0);
661 __lsx_vstelm_d(dst1,
dst + dst_stride_3x, 0, 1);
662 dst += dst_stride_4x;
663 __lsx_vstelm_d(dst2,
dst, 0, 0);
664 __lsx_vstelm_d(dst2,
dst + dst_stride, 0, 1);
665 __lsx_vstelm_d(dst3,
dst + dst_stride_2x, 0, 0);
666 __lsx_vstelm_d(dst3,
dst + dst_stride_3x, 0, 1);
667 dst += dst_stride_4x;
671 src1 = __lsx_vldx(
src, src_stride);
673 src0, mask2,
src1,
src0, mask3, vec0, vec1, vec2, vec3);
674 dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
675 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
677 dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
678 __lsx_vstelm_d(dst0,
dst, 0, 0);
679 __lsx_vstelm_d(dst0,
dst + dst_stride, 0, 1);
680 src += src_stride_2x;
681 dst += dst_stride_2x;
690 int32_t src_stride_2x = (src_stride << 1);
691 int32_t dst_stride_x = (dst_stride << 1);
692 int32_t src_stride_4x = (src_stride << 2);
693 int32_t dst_stride_2x = (dst_stride << 2);
694 int32_t src_stride_3x = src_stride_2x + src_stride;
695 int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
697 __m128i filt0, filt1, filt2, filt3;
698 __m128i mask1, mask2, mask3;
699 __m128i vec0, vec1, vec2, vec3;
700 __m128i dst0, dst1, dst2, dst3;
705 filt0, filt1, filt2, filt3);
707 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
708 mask3 = __lsx_vaddi_bu(mask0, 6);
710 for (loop_cnt = (
height >> 2); loop_cnt--;) {
713 src3 = __lsx_vldx(
src, src_stride_3x);
714 src += src_stride_4x;
717 src0, mask2,
src0,
src0, mask3, vec0, vec1, vec2, vec3);
718 dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
719 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
721 dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
723 src1, mask2,
src1,
src1, mask3, vec0, vec1, vec2, vec3);
724 dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
725 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
727 dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
729 src2, mask2,
src2,
src2, mask3, vec0, vec1, vec2, vec3);
730 dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
731 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
733 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
734 DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
735 src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3);
736 dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
737 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
739 dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
741 __lsx_vst(dst0,
dst, 0);
742 __lsx_vstx(dst1,
dst, dst_stride_x);
743 __lsx_vstx(dst2,
dst, dst_stride_2x);
744 __lsx_vstx(dst3,
dst, dst_stride_3x);
745 dst += dst_stride_2x;
754 int32_t src_stride_2x = (src_stride << 1);
755 int32_t src_stride_4x = (src_stride << 2);
756 int32_t src_stride_3x = src_stride_2x + src_stride;
759 __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
760 __m128i vec0, vec1, vec2, vec3, vec4, vec5;
761 __m128i filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
766 filt0, filt1, filt2, filt3);
769 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
770 mask3 = __lsx_vaddi_bu(mask0, 6);
772 DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6);
773 mask7 = __lsx_vaddi_bu(mask4, 6);
775 for (loop_cnt =
height >> 2; loop_cnt--;) {
778 src3 = __lsx_vldx(
src, src_stride_3x);
779 src4 = __lsx_vld(
_src, 0);
781 src7 = __lsx_vldx(
_src, src_stride_3x);
782 src += src_stride_4x;
783 _src += src_stride_4x;
787 DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask4, src7, src6, mask4, vec4, vec5);
788 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
789 vec3, filt0, dst0, dst1, dst2, dst3);
790 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
793 DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask5, src7, src6, mask5, vec4, vec5);
794 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
795 dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
796 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1, dst4, dst5);
799 DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask6, src7, src6, mask6, vec4, vec5);
800 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
801 dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
802 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2, dst4, dst5);
805 DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask7, src7, src6, mask7, vec4, vec5);
806 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
807 dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
808 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3, dst4, dst5);
810 __lsx_vst(dst0,
dst, 0);
811 __lsx_vstelm_d(dst4,
dst, 16, 0);
813 __lsx_vst(dst1,
dst, 0);
814 __lsx_vstelm_d(dst4,
dst, 16, 1);
816 __lsx_vst(dst2,
dst, 0);
817 __lsx_vstelm_d(dst5,
dst, 16, 0);
819 __lsx_vst(dst3,
dst, 0);
820 __lsx_vstelm_d(dst5,
dst, 16, 1);
831 __m128i filt0, filt1, filt2, filt3;
832 __m128i mask1, mask2, mask3;
833 __m128i vec0, vec1, vec2, vec3;
834 __m128i dst0, dst1, dst2, dst3;
840 filt0, filt1, filt2, filt3);
842 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
843 mask3 = __lsx_vaddi_bu(mask0, 6);
845 for (loop_cnt = (
height >> 1); loop_cnt--;) {
855 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
856 vec3, filt0, dst0, dst1, dst2, dst3);
861 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
862 dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
867 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
868 dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
873 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
874 dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
876 __lsx_vst(dst0,
dst, 0);
877 __lsx_vst(dst1,
dst, 16);
879 __lsx_vst(dst2,
dst, 0);
880 __lsx_vst(dst3,
dst, 16);
891 __m128i filt0, filt1, filt2, filt3;
892 __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
893 __m128i vec0, vec1, vec2, vec3, vec4, vec5;
894 __m128i dst0, dst1, dst2, dst3, dst4, dst5;
899 filt0, filt1, filt2, filt3);
901 DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
902 mask2, mask3, mask4);
903 DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
904 mask7 = __lsx_vaddi_bu(mask0, 14);
906 for (loop_cnt = (
height >> 1); loop_cnt--;) {
913 src1, mask0,
src2,
src2, mask0, vec0, vec1, vec2, vec3);
914 DUP2_ARG3(__lsx_vshuf_b, src3,
src2, mask4, src3, src3, mask0,
916 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
917 vec3, filt0, dst0, dst1, dst2, dst3);
918 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
920 src1, mask1,
src2,
src2, mask1, vec0, vec1, vec2, vec3);
921 DUP2_ARG3(__lsx_vshuf_b, src3,
src2, mask5, src3, src3, mask1,
923 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
924 dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
925 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
928 src1, mask2,
src2,
src2, mask2, vec0, vec1, vec2, vec3);
929 DUP2_ARG3(__lsx_vshuf_b, src3,
src2, mask6, src3, src3, mask2,
931 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
932 dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
933 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
936 src1, mask3,
src2,
src2, mask3, vec0, vec1, vec2, vec3);
937 DUP2_ARG3(__lsx_vshuf_b, src3,
src2, mask7, src3, src3, mask3,
939 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
940 dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
941 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
944 __lsx_vst(dst0,
dst, 0);
945 __lsx_vst(dst1,
dst, 16);
946 __lsx_vst(dst2,
dst, 32);
948 __lsx_vst(dst3,
dst, 0);
949 __lsx_vst(dst4,
dst, 16);
950 __lsx_vst(dst5,
dst, 32);
961 __m128i filt0, filt1, filt2, filt3;
962 __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
963 __m128i vec0, vec1, vec2, vec3;
964 __m128i dst0, dst1, dst2, dst3;
969 filt0, filt1, filt2, filt3);
971 DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8,
972 mask1, mask2, mask3, mask4);
973 DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
974 mask7 = __lsx_vaddi_bu(mask0, 14);
976 for (loop_cnt =
height; loop_cnt--;) {
982 src0, mask2,
src0,
src0, mask3, vec0, vec1, vec2, vec3);
983 dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
984 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
986 dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
989 src0, mask6,
src1,
src0, mask7, vec0, vec1, vec2, vec3);
990 dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
991 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
993 dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
995 mask2,
src1,
src1, mask3, vec0, vec1, vec2, vec3);
996 dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
997 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
999 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
1001 mask2,
src2,
src2, mask3, vec0, vec1, vec2, vec3);
1002 dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
1003 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
1005 dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
1007 __lsx_vst(dst0,
dst, 0);
1008 __lsx_vst(dst1,
dst, 16);
1009 __lsx_vst(dst2,
dst, 32);
1010 __lsx_vst(dst3,
dst, 48);
1021 __m128i filt0, filt1, filt2, filt3;
1022 __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1023 __m128i vec0, vec1, vec2, vec3, vec4, vec5;
1024 __m128i dst0, dst1, dst2, dst3, dst4, dst5;
1029 filt0, filt1, filt2, filt3);
1031 DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
1032 mask2, mask3, mask4);
1033 DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
1034 mask7 = __lsx_vaddi_bu(mask0, 14);
1036 for (loop_cnt =
height; loop_cnt--;) {
1039 src3 = __lsx_vld(
src, 40);
1043 mask0,
src2,
src1, mask4, vec0, vec1, vec2, vec3);
1044 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
1045 vec3, filt0, dst0, dst1, dst2, dst3);
1047 src1, mask1,
src2,
src1, mask5, vec0, vec1, vec2, vec3);
1048 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
1049 dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
1051 src1, mask2,
src2,
src1, mask6, vec0, vec1, vec2, vec3);
1052 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
1053 dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
1055 src1, mask3,
src2,
src1, mask7, vec0, vec1, vec2, vec3);
1056 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
1057 dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
1058 __lsx_vst(dst0,
dst, 0);
1059 __lsx_vst(dst1,
dst, 16);
1060 __lsx_vst(dst2,
dst, 32);
1061 __lsx_vst(dst3,
dst, 48);
1065 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
1068 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
1072 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
1076 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
1078 __lsx_vst(dst4,
dst, 64);
1079 __lsx_vst(dst5,
dst, 80);
1090 __m128i filt0, filt1, filt2, filt3;
1091 __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1092 __m128i vec0, vec1, vec2, vec3;
1093 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1098 filt0, filt1, filt2, filt3);
1100 DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
1101 mask2, mask3, mask4);
1102 DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6)
1103 mask7 = __lsx_vaddi_bu(mask0, 14);
1105 for (loop_cnt =
height; loop_cnt--;) {
1108 src4 = __lsx_vld(
src, 56);
1112 src0, mask2,
src0,
src0, mask3, vec0, vec1, vec2, vec3);
1113 dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
1114 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
1116 dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
1117 __lsx_vst(dst0,
dst, 0);
1120 src0, mask6,
src1,
src0, mask7, vec0, vec1, vec2, vec3);
1121 dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
1122 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
1124 dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
1125 __lsx_vst(dst1,
dst, 16);
1128 src1, mask2,
src1,
src1, mask3, vec0, vec1, vec2, vec3);
1129 dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
1130 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
1132 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
1133 __lsx_vst(dst2,
dst, 32);
1136 src1, mask6,
src2,
src1, mask7, vec0, vec1, vec2, vec3);
1137 dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
1138 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
1140 dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
1141 __lsx_vst(dst3,
dst, 48);
1144 src2, mask2,
src2,
src2, mask3, vec0, vec1, vec2, vec3);
1145 dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
1146 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
1148 dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
1149 __lsx_vst(dst4,
dst, 64);
1152 src2, mask6, src3,
src2, mask7, vec0, vec1, vec2, vec3);
1153 dst5 = __lsx_vdp2_h_bu_b(vec0, filt0);
1154 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec1, filt1, dst5, vec2, filt2,
1156 dst5 = __lsx_vdp2add_h_bu_b(dst5, vec3, filt3);
1157 __lsx_vst(dst5,
dst, 80);
1159 DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
1160 src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3);
1161 dst6 = __lsx_vdp2_h_bu_b(vec0, filt0);
1162 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec1, filt1, dst6, vec2, filt2,
1164 dst6 = __lsx_vdp2add_h_bu_b(dst6, vec3, filt3);
1165 __lsx_vst(dst6,
dst, 96);
1167 DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
1168 src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
1169 dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
1170 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2,
1172 dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
1173 __lsx_vst(dst7,
dst, 112);
1184 int32_t src_stride_2x = (src_stride << 1);
1185 int32_t src_stride_4x = (src_stride << 2);
1186 int32_t src_stride_3x = src_stride_2x + src_stride;
1187 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1188 __m128i src9, src10, src11, src12, src13, src14;
1189 __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
1190 __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
1191 __m128i src1110_r, src1211_r, src1312_r, src1413_r;
1192 __m128i src2110, src4332, src6554, src8776, src10998;
1193 __m128i src12111110, src14131312;
1194 __m128i dst10, dst32, dst54, dst76;
1195 __m128i filt0, filt1, filt2, filt3;
1197 src -= src_stride_3x;
1200 filt0, filt1, filt2, filt3);
1204 src3 = __lsx_vldx(
src, src_stride_3x);
1205 src += src_stride_4x;
1206 src4 = __lsx_vld(
src, 0);
1207 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
1208 src += src_stride_3x;
1210 src10_r, src32_r, src54_r, src21_r);
1211 DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
1212 DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r,
1214 src6554 = __lsx_vilvl_d(src65_r, src54_r);
1216 for (loop_cnt = (
height >> 3); loop_cnt--;) {
1217 src7 = __lsx_vld(
src, 0);
1218 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src8, src9);
1219 src10 = __lsx_vldx(
src, src_stride_3x);
1220 src += src_stride_4x;
1221 src11 = __lsx_vld(
src, 0);
1224 src14 = __lsx_vldx(
src, src_stride_3x);
1225 src += src_stride_4x;
1227 DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
1228 src76_r, src87_r, src98_r, src109_r);
1229 DUP4_ARG2(__lsx_vilvl_b, src11, src10, src12, src11, src13, src12, src14,
1230 src13, src1110_r, src1211_r, src1312_r, src1413_r);
1231 DUP4_ARG2(__lsx_vilvl_d, src87_r, src76_r, src109_r, src98_r, src1211_r,
1232 src1110_r, src1413_r, src1312_r, src8776, src10998,
1233 src12111110, src14131312);
1235 dst10 = __lsx_vdp2_h_bu_b(src2110, filt0);
1236 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554,
1237 filt2, dst10, dst10);
1238 dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3);
1239 dst32 = __lsx_vdp2_h_bu_b(src4332, filt0);
1240 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst32, src6554, filt1, dst32, src8776,
1241 filt2, dst32, dst32);
1242 dst32 = __lsx_vdp2add_h_bu_b(dst32, src10998, filt3);
1243 dst54 = __lsx_vdp2_h_bu_b(src6554, filt0);
1244 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst54, src8776, filt1,
1245 dst54, src10998, filt2, dst54, dst54);
1246 dst54 = __lsx_vdp2add_h_bu_b(dst54, src12111110, filt3);
1247 dst76 = __lsx_vdp2_h_bu_b(src8776, filt0);
1248 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst76, src10998, filt1, dst76,
1249 src12111110, filt2, dst76, dst76);
1250 dst76 = __lsx_vdp2add_h_bu_b(dst76, src14131312, filt3);
1252 __lsx_vstelm_d(dst10,
dst, 0, 0);
1254 __lsx_vstelm_d(dst10,
dst, 0, 1);
1256 __lsx_vstelm_d(dst32,
dst, 0, 0);
1258 __lsx_vstelm_d(dst32,
dst, 0, 1);
1260 __lsx_vstelm_d(dst54,
dst, 0, 0);
1262 __lsx_vstelm_d(dst54,
dst, 0, 1);
1264 __lsx_vstelm_d(dst76,
dst, 0, 0);
1266 __lsx_vstelm_d(dst76,
dst, 0, 1);
1270 src4332 = src12111110;
1271 src6554 = src14131312;
1275 src7 = __lsx_vld(
src, 0);
1276 src8 = __lsx_vldx(
src, src_stride);
1277 DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
1278 src += src_stride_2x;
1279 src8776 = __lsx_vilvl_d(src87_r, src76_r);
1281 dst10 = __lsx_vdp2_h_bu_b(src2110, filt0);
1282 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554,
1283 filt2, dst10, dst10);
1284 dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3);
1286 __lsx_vstelm_d(dst10,
dst, 0, 0);
1288 __lsx_vstelm_d(dst10,
dst, 0, 1);
/*
 * Fragment of a vertical 8-tap filter routine. Its signature, the filter
 * loads, the first source-row loads and the end of the loop body are on
 * lines that are not visible here.
 * NOTE(review): presumably the 8-column variant, since every output row is
 * written with a single 16-byte vector store -- confirm against the full
 * file.
 *
 * Stride helpers: src_stride_2x/_3x/_4x are 2, 3 and 4 source rows.
 * NOTE(review): dst_stride_x, dst_stride_2x and dst_stride_3x evaluate to
 * 2x, 4x and 6x dst_stride. They look like byte offsets of rows 1..3 of a
 * 16-bit destination -- dst's type is not visible here, confirm.
 */
1303 int32_t src_stride_2x = (src_stride << 1);
1304 int32_t dst_stride_x = (dst_stride << 1);
1305 int32_t src_stride_4x = (src_stride << 2);
1306 int32_t dst_stride_2x = (dst_stride << 2);
1307 int32_t src_stride_3x = src_stride_2x + src_stride;
1308 int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
1309 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1310 __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
1311 __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
1312 __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1313 __m128i filt0, filt1, filt2, filt3;
/* Step src back by three rows before the first loads. */
1315 src -= src_stride_3x;
1318 filt0, filt1, filt2, filt3);
1322 src3 = __lsx_vldx(
src, src_stride_3x);
1323 src += src_stride_4x;
1324 src4 = __lsx_vld(
src, 0);
1325 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
1326 src += src_stride_3x;
1328 src10_r, src32_r, src54_r, src21_r);
/* Low-half byte interleave of row pairs (4,3) and (6,5). */
1329 DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
/* Each iteration loads four new source rows and produces four output rows. */
1331 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1332 src7 = __lsx_vld(
src, 0);
1333 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src8, src9);
1334 src10 = __lsx_vldx(
src, src_stride_3x);
1335 src += src_stride_4x;
/* Interleave every new row with the row loaded just before it. */
1336 DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
1337 src9, src76_r, src87_r, src98_r, src109_r);
/* Output 0: filt0..filt3 applied to row pairs 10, 32, 54, 76. */
1339 dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
1340 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
1341 src54_r, filt2, dst0_r, dst0_r);
1342 dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
/* Output 1: row pairs 21, 43, 65, 87. */
1343 dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
1344 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
1345 src65_r, filt2, dst1_r, dst1_r);
1346 dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
/* Output 2: row pairs 32, 54, 76, 98. */
1347 dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
1348 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
1349 src76_r, filt2, dst2_r, dst2_r);
1350 dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
/* Output 3: row pairs 43, 65, 87, 109. */
1351 dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
1352 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
1353 src87_r, filt2, dst3_r, dst3_r);
1354 dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
/* Store the four results at dst + {0, dst_stride_x, dst_stride_2x, dst_stride_3x}. */
1356 __lsx_vst(dst0_r,
dst, 0);
1357 __lsx_vstx(dst1_r,
dst, dst_stride_x);
1358 __lsx_vstx(dst2_r,
dst, dst_stride_2x);
1359 __lsx_vstx(dst3_r,
dst, dst_stride_3x);
/* Advance dst by dst_stride_2x (4 * dst_stride). */
1360 dst += dst_stride_2x;
/*
 * Fragment of a vertical 8-tap filter routine that keeps two sets of
 * interleaved rows: the *_r vectors (low-half byte interleave) and the *_l
 * vectors (high-half byte interleave), the latter packed two-per-vector
 * into src2110 .. src10998. The signature, the filter loads, the first row
 * loads and the tail of the loop are on lines not visible here.
 * NOTE(review): presumably the 12-column variant (a 16-byte store plus an
 * 8-byte store at byte offset 16 per row) -- confirm against the full file.
 */
1377 int32_t src_stride_2x = (src_stride << 1);
1378 int32_t src_stride_4x = (src_stride << 2);
1379 int32_t src_stride_3x = src_stride_2x + src_stride;
1380 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1381 __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
1382 __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
1383 __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1384 __m128i src10_l, src32_l, src54_l, src76_l, src98_l;
1385 __m128i src21_l, src43_l, src65_l, src87_l, src109_l;
1386 __m128i src2110, src4332, src6554, src8776, src10998;
1387 __m128i dst0_l, dst1_l;
1388 __m128i filt0, filt1, filt2, filt3;
/* Step src back by three rows before the first loads. */
1390 src -= src_stride_3x;
1393 filt0, filt1, filt2, filt3);
1396 src3 = __lsx_vldx(
src, src_stride_3x);
1397 src += src_stride_4x;
1398 src4 = __lsx_vld(
src, 0);
1399 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
1400 src += src_stride_3x;
1402 src10_r, src32_r, src54_r, src21_r);
1403 DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
1405 src10_l, src32_l, src54_l, src21_l);
1406 DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
1407 DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l,
1409 src6554 = __lsx_vilvl_d(src65_l, src54_l);
/* Each iteration loads four new source rows and produces four output rows. */
1411 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1412 src7 = __lsx_vld(
src, 0);
1413 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src8, src9);
1414 src10 = __lsx_vldx(
src, src_stride_3x);
1415 src += src_stride_4x;
/* Low-half and high-half interleaves of each new row with its predecessor. */
1416 DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
1417 src9, src76_r, src87_r, src98_r, src109_r);
1418 DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
1419 src9, src76_l, src87_l, src98_l, src109_l);
1420 DUP2_ARG2(__lsx_vilvl_d, src87_l, src76_l, src109_l, src98_l,
1423 dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
1424 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
1425 src54_r, filt2, dst0_r, dst0_r);
1426 dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
1427 dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
1428 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
1429 src65_r, filt2, dst1_r, dst1_r);
1430 dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
1431 dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
1432 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
1433 src76_r, filt2, dst2_r, dst2_r);
1434 dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
1435 dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
1436 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
1437 src87_r, filt2, dst3_r, dst3_r);
1438 dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
/* The packed *_l data yields two output rows per vector: dst0_l and dst1_l. */
1439 dst0_l = __lsx_vdp2_h_bu_b(src2110, filt0);
1440 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src4332, filt1, dst0_l,
1441 src6554, filt2, dst0_l, dst0_l);
1442 dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src8776, filt3);
1443 dst1_l = __lsx_vdp2_h_bu_b(src4332, filt0);
1444 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src6554, filt1, dst1_l,
1445 src8776, filt2, dst1_l, dst1_l);
1446 dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src10998, filt3);
/*
 * Per row: a full vector at byte offset 0 and one 64-bit element of
 * dst0_l / dst1_l (element 0, then element 1) at byte offset 16.
 * NOTE(review): the statements that advance dst between these store pairs
 * are not visible in this fragment.
 */
1448 __lsx_vst(dst0_r,
dst, 0);
1449 __lsx_vstelm_d(dst0_l,
dst, 16, 0);
1451 __lsx_vst(dst1_r,
dst, 0);
1452 __lsx_vstelm_d(dst0_l,
dst, 16, 1);
1454 __lsx_vst(dst2_r,
dst, 0);
1455 __lsx_vstelm_d(dst1_l,
dst, 16, 0);
1457 __lsx_vst(dst3_r,
dst, 0);
1458 __lsx_vstelm_d(dst1_l,
dst, 16, 1);
/*
 * Fragment of a vertical 8-tap filter routine that walks the block in
 * 16-column strips (outer loop: width >> 4) and, inside each strip, four
 * rows at a time (inner loop: height >> 2). The signature, the filter
 * loads, the per-strip setup of src_tmp/dst_tmp and the tails of both loops
 * are on lines not visible here.
 */
1482 const uint8_t *src_tmp;
1485 int32_t src_stride_2x = (src_stride << 1);
1486 int32_t src_stride_4x = (src_stride << 2);
1487 int32_t src_stride_3x = src_stride_2x + src_stride;
1488 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1489 __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
1490 __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
1491 __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1492 __m128i src10_l, src32_l, src54_l, src76_l, src98_l;
1493 __m128i src21_l, src43_l, src65_l, src87_l, src109_l;
1494 __m128i dst0_l, dst1_l, dst2_l, dst3_l;
1495 __m128i filt0, filt1, filt2, filt3;
/* Step src back by three rows before the first loads. */
1497 src -= src_stride_3x;
1500 filt0, filt1, filt2, filt3);
/* One pass per 16-column strip. */
1502 for (cnt =
width >> 4; cnt--;) {
/* Load the first seven rows of the strip through src_tmp. */
1506 src0 = __lsx_vld(src_tmp, 0);
1507 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1509 src3 = __lsx_vldx(src_tmp, src_stride_3x);
1510 src_tmp += src_stride_4x;
1511 src4 = __lsx_vld(src_tmp, 0);
1512 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1514 src_tmp += src_stride_3x;
1516 src10_r, src32_r, src54_r, src21_r);
1517 DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
1519 src10_l, src32_l, src54_l, src21_l);
1520 DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
/* Four new source rows in, four output rows out per iteration. */
1522 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1523 src7 = __lsx_vld(src_tmp, 0);
1524 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1526 src10 = __lsx_vldx(src_tmp, src_stride_3x);
1527 src_tmp += src_stride_4x;
/* Low-half (_r) and high-half (_l) interleaves of adjacent rows. */
1528 DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
1529 src10, src9, src76_r, src87_r, src98_r, src109_r);
1530 DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
1531 src10, src9, src76_l, src87_l, src98_l, src109_l);
/* Four-term accumulation with filt0..filt3 for the _r half of rows 0..3. */
1533 dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
1534 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
1535 src54_r, filt2, dst0_r, dst0_r);
1536 dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
1537 dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
1538 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
1539 src65_r, filt2, dst1_r, dst1_r);
1540 dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
1541 dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
1542 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
1543 src76_r, filt2, dst2_r, dst2_r);
1544 dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
1545 dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
1546 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
1547 src87_r, filt2, dst3_r, dst3_r);
1548 dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
/* Same computation for the _l half. */
1549 dst0_l = __lsx_vdp2_h_bu_b(src10_l, filt0);
1550 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src32_l, filt1, dst0_l,
1551 src54_l, filt2, dst0_l, dst0_l);
1552 dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src76_l, filt3);
1553 dst1_l = __lsx_vdp2_h_bu_b(src21_l, filt0);
1554 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src43_l, filt1, dst1_l,
1555 src65_l, filt2, dst1_l, dst1_l);
1556 dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src87_l, filt3);
1557 dst2_l = __lsx_vdp2_h_bu_b(src32_l, filt0);
1558 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_l, src54_l, filt1, dst2_l,
1559 src76_l, filt2, dst2_l, dst2_l);
1560 dst2_l = __lsx_vdp2add_h_bu_b(dst2_l, src98_l, filt3);
1561 dst3_l = __lsx_vdp2_h_bu_b(src43_l, filt0);
1562 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_l, src65_l, filt1, dst3_l,
1563 src87_l, filt2, dst3_l, dst3_l);
1564 dst3_l = __lsx_vdp2add_h_bu_b(dst3_l, src109_l, filt3);
/* Write 32 bytes per row (_r at offset 0, _l at offset 16), one row at a time. */
1566 __lsx_vst(dst0_r, dst_tmp, 0);
1567 __lsx_vst(dst0_l, dst_tmp, 16);
1568 dst_tmp += dst_stride;
1569 __lsx_vst(dst1_r, dst_tmp, 0);
1570 __lsx_vst(dst1_l, dst_tmp, 16);
1571 dst_tmp += dst_stride;
1572 __lsx_vst(dst2_r, dst_tmp, 0);
1573 __lsx_vst(dst2_l, dst_tmp, 16);
1574 dst_tmp += dst_stride;
1575 __lsx_vst(dst3_r, dst_tmp, 0);
1576 __lsx_vst(dst3_l, dst_tmp, 16);
1577 dst_tmp += dst_stride;
/*
 * Fragment of a two-dimensional (horizontal then vertical) 8-tap filter
 * routine: filter_x drives the byte dot products, filter_y the halfword
 * dot products. Each shuffle combines two source rows into one vector, and
 * results are written as 64-bit elements. The start of the signature, the
 * declaration of mask0/filter_vec and several statement tails are on lines
 * not visible here.
 * NOTE(review): presumably the 4-column variant -- confirm against the full
 * file.
 */
1642 const int8_t *filter_x,
const int8_t *filter_y,
1646 int32_t src_stride_2x = (src_stride << 1);
1647 int32_t src_stride_4x = (src_stride << 2);
1648 int32_t src_stride_3x = src_stride_2x + src_stride;
1649 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1650 __m128i filt0, filt1, filt2, filt3;
1651 __m128i filt_h0, filt_h1, filt_h2, filt_h3;
1652 __m128i mask1, mask2, mask3;
1654 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1655 __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1656 __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1657 __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1658 __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
1659 __m128i dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
/* Step src back by three rows and three bytes. */
1664 src -= src_stride_3x + 3;
/* Replicate the halfwords at byte offsets 0, 2, 4, 6 of filter_x. */
1665 DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
1666 filter_x, 6, filt0, filt1, filt2, filt3);
/* Load filter_y, widen it to halfwords, then replicate words 0..3. */
1667 filter_vec = __lsx_vld(filter_y, 0);
1668 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1670 DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
1671 filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
/* mask1..mask3 are mask0 with every byte index raised by 2, 4 and 6. */
1672 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
1673 mask3 = __lsx_vaddi_bu(mask0, 6);
1677 src3 = __lsx_vldx(
src, src_stride_3x);
1678 src += src_stride_4x;
1679 src4 = __lsx_vld(
src, 0);
1680 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
1681 src += src_stride_3x;
1684 mask2, src3,
src0, mask3, vec0, vec1, vec2, vec3);
1686 mask2, src4,
src1, mask3, vec4, vec5, vec6, vec7);
1688 mask2, src5,
src2, mask3, vec8, vec9, vec10, vec11);
1689 DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask0, src6, src3, mask1, src6, src3,
1690 mask2, src6, src3, mask3, vec12, vec13, vec14, vec15);
/* Horizontal pass over the seven preloaded rows, two rows per vector. */
1691 dst30 = __lsx_vdp2_h_bu_b(vec0, filt0);
1692 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2,
1694 dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3);
1695 dst41 = __lsx_vdp2_h_bu_b(vec4, filt0);
1696 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2,
1698 dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3);
1699 dst52 = __lsx_vdp2_h_bu_b(vec8, filt0);
1700 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2,
1702 dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3);
1703 dst63 = __lsx_vdp2_h_bu_b(vec12, filt0);
1704 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2,
1706 dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3);
/* Halfword-interleave the horizontal results into vertical row pairs. */
1708 DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r);
1709 DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r);
1710 dst32_r = __lsx_vilvl_h(dst63, dst52);
1711 dst65_r = __lsx_vilvh_h(dst63, dst52);
/* dst66: element 1 (64-bit) of dst63 replicated. */
1712 dst66 = __lsx_vreplvei_d(dst63, 1);
/* Four new source rows in, four output rows out per iteration. */
1714 for (loop_cnt =
height >> 2; loop_cnt--;) {
1715 src7 = __lsx_vld(
src, 0);
1716 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src8, src9);
1717 src10 = __lsx_vldx(
src, src_stride_3x);
1718 src += src_stride_4x;
/* Shuffle rows (9,7) and (10,8) together, then filter horizontally. */
1720 DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask0, src9, src7, mask1, src9, src7,
1721 mask2, src9, src7, mask3, vec0, vec1, vec2, vec3);
1722 DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask0, src10, src8, mask1, src10, src8,
1723 mask2, src10, src8, mask3, vec4, vec5, vec6, vec7);
1725 dst97 = __lsx_vdp2_h_bu_b(vec0, filt0);
1726 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2,
1728 dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3);
1729 dst108 = __lsx_vdp2_h_bu_b(vec4, filt0);
1730 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6,
1731 filt2, dst108, dst108);
1732 dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3);
/* Build the row pairs 76, 87, 98 and 109 needed by the vertical pass. */
1734 DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r);
1735 dst109_r = __lsx_vilvh_h(dst108, dst97);
1736 dst66 = __lsx_vreplvei_d(dst97, 1);
1737 dst98_r = __lsx_vilvl_h(dst66, dst108);
/* Vertical pass: four-term accumulation with filt_h0..filt_h3. */
1739 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
1740 filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
1741 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r,
1742 filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1,
1743 dst0_r, dst1_r, dst2_r, dst3_r);
1744 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r,
1745 filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2,
1746 dst0_r, dst1_r, dst2_r, dst3_r);
1747 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r,
1748 filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3,
1749 dst0_r, dst1_r, dst2_r, dst3_r);
/* Shift the word sums right by 6 and pack the even halfwords. */
1750 DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
1751 dst0_r, dst1_r, dst2_r, dst3_r);
1752 DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
/*
 * One 64-bit element per output row.
 * NOTE(review): the statements advancing dst between these stores and the
 * sliding-window updates that follow are not visible in this fragment.
 */
1753 __lsx_vstelm_d(dst0_r,
dst, 0, 0);
1755 __lsx_vstelm_d(dst0_r,
dst, 0, 1);
1757 __lsx_vstelm_d(dst2_r,
dst, 0, 0);
1759 __lsx_vstelm_d(dst2_r,
dst, 0, 1);
/* Carry element 1 of dst108 into the next iteration. */
1768 dst66 = __lsx_vreplvei_d(dst108, 1);
/*
 * Fragment of a two-dimensional (horizontal then vertical) 8-tap filter
 * routine that walks the block in 8-column strips (outer loop: width >> 3)
 * and produces one output row per inner iteration (loop_cnt = height).
 * The start of the signature, the per-strip setup of src_tmp/dst_tmp, some
 * statement tails and the end of both loops are on lines not visible here.
 */
1776 const int8_t *filter_x,
1777 const int8_t *filter_y,
1781 uint32_t loop_cnt, cnt;
1782 const uint8_t *src_tmp;
1784 int32_t src_stride_2x = (src_stride << 1);
1785 int32_t src_stride_4x = (src_stride << 2);
1786 int32_t src_stride_3x = src_stride_2x + src_stride;
1787 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7;
1788 __m128i filt0, filt1, filt2, filt3;
1789 __m128i filt_h0, filt_h1, filt_h2, filt_h3;
1790 __m128i mask1, mask2, mask3;
1792 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1793 __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1794 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1795 __m128i dst0_r, dst0_l;
1796 __m128i dst10_r, dst32_r, dst54_r, dst76_r;
1797 __m128i dst10_l, dst32_l, dst54_l, dst76_l;
/* Shuffle indices 0,1,1,2,2,3,...,7,8 (least-significant byte first). */
1798 __m128i mask0 = {0x403030202010100, 0x807070606050504};
/* Step src back by three rows and three bytes. */
1800 src -= src_stride_3x + 3;
/* Replicate the halfwords at byte offsets 0, 2, 4, 6 of filter_x. */
1801 DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
1802 filter_x, 6, filt0, filt1, filt2, filt3);
/* Load filter_y, widen it to halfwords, then replicate words 0..3. */
1804 filter_vec = __lsx_vld(filter_y, 0);
1805 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1807 DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
1808 filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
/* mask1..mask3 are mask0 with every byte index raised by 2, 4 and 6. */
1810 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
1811 mask3 = __lsx_vaddi_bu(mask0, 6);
/* One pass per 8-column strip. */
1813 for (cnt =
width >> 3; cnt--;) {
/* Preload seven rows of the strip through src_tmp. */
1816 src0 = __lsx_vld(src_tmp, 0);
1817 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1819 src3 = __lsx_vldx(src_tmp, src_stride_3x);
1820 src_tmp += src_stride_4x;
1821 src4 = __lsx_vld(src_tmp, 0);
1822 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1824 src_tmp += src_stride_3x;
1828 src0, mask2,
src0,
src0, mask3, vec0, vec1, vec2, vec3);
1830 src1, mask2,
src1,
src1, mask3, vec4, vec5, vec6, vec7);
1832 src2, mask2,
src2,
src2, mask3, vec8, vec9, vec10, vec11);
1833 DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
1834 src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
/* Horizontal pass for rows 0..3 (dst0..dst3). */
1835 dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
1836 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
1838 dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
1839 dst1 = __lsx_vdp2_h_bu_b(vec4, filt0);
1840 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2,
1842 dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3);
1843 dst2 = __lsx_vdp2_h_bu_b(vec8, filt0);
1844 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2,
1846 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3);
1847 dst3 = __lsx_vdp2_h_bu_b(vec12, filt0);
1848 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2,
1850 dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3);
/* Horizontal pass for rows 4..6 (dst4..dst6). */
1853 DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
1854 src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
1855 DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
1856 src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
1857 DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
1858 src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
1859 dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
1860 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
1862 dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
1863 dst5 = __lsx_vdp2_h_bu_b(vec4, filt0);
1864 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2,
1866 dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3);
1867 dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
1868 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2,
1870 dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
/* One new source row in, one output row out per iteration. */
1872 for (loop_cnt =
height; loop_cnt--;) {
1873 src7 = __lsx_vld(src_tmp, 0);
1874 src_tmp += src_stride;
/* Horizontal pass for the new row (dst7). */
1876 DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
1877 src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
1878 dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
1879 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
1881 dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
/* Pair the eight horizontal results row-wise (low and high halves). */
1883 DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
1884 dst6, dst10_r, dst32_r, dst54_r, dst76_r);
1885 DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
1886 dst6, dst10_l, dst32_l, dst54_l, dst76_l);
1888 DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
1890 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
1891 dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
1892 dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
1893 DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
1894 dst76_l, filt_h3, dst0_r, dst0_l);
/* Shift the word sums right by 6, pack to halfwords and store 16 bytes. */
1895 dst0_r = __lsx_vsrai_w(dst0_r, 6);
1896 dst0_l = __lsx_vsrai_w(dst0_l, 6);
1898 dst0_r = __lsx_vpickev_h(dst0_l, dst0_r);
1899 __lsx_vst(dst0_r, dst_tmp, 0);
1900 dst_tmp += dst_stride;
/*
 * Fragment of a thin wrapper: the visible tail of its call forwards
 * filter_x, filter_y and height with a trailing constant 8.
 * NOTE(review): presumably that constant is the block width handed to the
 * strip-based 2-D 8-tap helper -- wrapper and callee names are on lines not
 * visible here, confirm.
 */
1917 const int8_t *filter_x,
const int8_t *filter_y,
1921 filter_x, filter_y,
height, 8);
/*
 * Fragment of a two-dimensional (horizontal then vertical) 8-tap filter
 * routine made of two passes:
 *   1. a pass through src_tmp/dst_tmp with mask0..mask3 that emits one
 *      16-byte output row per iteration (loop_cnt = height);
 *   2. a pass through src/dst with mask4..mask7 that shuffles two source
 *      rows into one vector and emits four 64-bit elements per iteration
 *      (loop_cnt = height >> 2).
 * The start of the signature, the mask0/mask4 initialisation, several
 * statement tails and the pointer set-up between the passes are on lines
 * not visible here.
 * NOTE(review): presumably the 12-column variant (8 columns in pass 1,
 * 4 columns in pass 2) -- confirm against the full file.
 */
1926 const int8_t *filter_x,
const int8_t *filter_y,
1930 const uint8_t *src_tmp;
1932 int32_t src_stride_2x = (src_stride << 1);
1933 int32_t src_stride_4x = (src_stride << 2);
1934 int32_t src_stride_3x = src_stride_2x + src_stride;
1935 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1936 __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1937 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1938 __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1939 __m128i filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1941 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1942 __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1943 __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
1944 __m128i dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
1945 __m128i dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
/* Step src back by three rows and three bytes. */
1947 src -= src_stride_3x + 3;
/* Replicate the halfwords at byte offsets 0, 2, 4, 6 of filter_x. */
1948 DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
1949 filter_x, 6, filt0, filt1, filt2, filt3);
/* Load filter_y, widen it to halfwords, then replicate words 0..3. */
1951 filter_vec = __lsx_vld(filter_y, 0);
1952 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1954 DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
1955 filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
/* mask1..mask3 are mask0 with every byte index raised by 2, 4 and 6. */
1958 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
1959 mask3 = __lsx_vaddi_bu(mask0, 6);
/* Pass 1: preload seven rows through src_tmp. */
1964 src0 = __lsx_vld(src_tmp, 0);
1965 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1967 src3 = __lsx_vldx(src_tmp, src_stride_3x);
1968 src_tmp += src_stride_4x;
1969 src4 = __lsx_vld(src_tmp, 0);
1970 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1972 src_tmp += src_stride_3x;
1976 mask2,
src0,
src0, mask3, vec0, vec1, vec2, vec3);
1978 mask2,
src1,
src1, mask3, vec4, vec5, vec6, vec7);
1980 mask2,
src2,
src2, mask3, vec8, vec9, vec10, vec11);
1981 DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, src3,
1982 mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
/* Horizontal pass for rows 0..3 (dst0..dst3). */
1983 dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
1984 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
1986 dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
1987 dst1 = __lsx_vdp2_h_bu_b(vec4, filt0);
1988 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2,
1990 dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3);
1991 dst2 = __lsx_vdp2_h_bu_b(vec8, filt0);
1992 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2,
1994 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3);
1995 dst3 = __lsx_vdp2_h_bu_b(vec12, filt0);
1996 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2,
1998 dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3);
/* Horizontal pass for rows 4..6 (dst4..dst6). */
2001 DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4, src4,
2002 mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
2003 DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5, src5,
2004 mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
2005 DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6, src6,
2006 mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
2007 dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
2008 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
2010 dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
2011 dst5 = __lsx_vdp2_h_bu_b(vec4, filt0);
2012 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2,
2014 dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3);
2015 dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
2016 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2,
2018 dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
/* One new source row in, one output row out per iteration. */
2020 for (loop_cnt =
height; loop_cnt--;) {
2021 src7 = __lsx_vld(src_tmp, 0);
2022 src_tmp += src_stride;
/* Horizontal pass for the new row (dst7). */
2024 DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
2025 src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
2026 dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
2027 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2,
2029 dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
/* Pair the eight horizontal results row-wise (low and high halves). */
2030 DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
2031 dst10_r, dst32_r, dst54_r, dst76_r);
2032 DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
2033 dst10_l, dst32_l, dst54_l, dst76_l);
2034 DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
2036 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
2037 filt_h1, dst0_r, dst54_r, filt_h2, dst0_l, dst54_l, filt_h2,
2038 dst0_r, dst0_l, dst0_r, dst0_l);
/* Terminated with ';' like every other DUPn_ARGm invocation in the file. */
2039 DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l, dst76_l,
2040 filt_h3, dst0_r, dst0_l);
/* Shift the word sums right by 6, pack to halfwords and store 16 bytes. */
2041 dst0_r = __lsx_vsrai_w(dst0_r, 6);
2042 dst0_l = __lsx_vsrai_w(dst0_l, 6);
2044 dst0_r = __lsx_vpickev_h(dst0_l, dst0_r);
2045 __lsx_vst(dst0_r, dst_tmp, 0);
2046 dst_tmp += dst_stride;
/* Pass 2: mask5..mask7 are mask4 with every byte index raised by 2, 4, 6. */
2060 DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6);
2061 mask7 = __lsx_vaddi_bu(mask4, 6);
2065 src3 = __lsx_vldx(
src, src_stride_3x);
2066 src += src_stride_4x;
2067 src4 = __lsx_vld(
src, 0);
2068 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
2069 src += src_stride_3x;
2072 mask6, src3,
src0, mask7, vec0, vec1, vec2, vec3);
2074 mask6, src4,
src1, mask7, vec4, vec5, vec6, vec7);
2076 mask6, src5,
src2, mask7, vec8, vec9, vec10, vec11);
2077 DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask4, src6, src3, mask5, src6, src3,
2078 mask6, src6, src3, mask7, vec12, vec13, vec14, vec15);
/* Horizontal pass over the seven preloaded rows, two rows per vector. */
2079 dst30 = __lsx_vdp2_h_bu_b(vec0, filt0);
2080 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2,
2082 dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3);
2083 dst41 = __lsx_vdp2_h_bu_b(vec4, filt0);
2084 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2,
2086 dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3);
2087 dst52 = __lsx_vdp2_h_bu_b(vec8, filt0);
2088 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2,
2090 dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3);
2091 dst63 = __lsx_vdp2_h_bu_b(vec12, filt0);
2092 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2,
2094 dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3);
/* Halfword-interleave the horizontal results into vertical row pairs. */
2096 DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r);
2097 DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r);
2098 dst32_r = __lsx_vilvl_h(dst63, dst52);
2099 dst65_r = __lsx_vilvh_h(dst63, dst52);
/* dst66: element 1 (64-bit) of dst63 replicated. */
2101 dst66 = __lsx_vreplvei_d(dst63, 1);
/* Four new source rows in, four output rows out per iteration. */
2103 for (loop_cnt =
height >> 2; loop_cnt--;) {
2104 src7 = __lsx_vld(
src, 0);
2105 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src8, src9);
2106 src10 = __lsx_vldx(
src, src_stride_3x);
2107 src += src_stride_4x;
/* Shuffle rows (9,7) and (10,8) together, then filter horizontally. */
2109 DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask4, src9, src7, mask5, src9,
2110 src7, mask6, src9, src7, mask7, vec0, vec1, vec2, vec3);
2111 DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask4, src10, src8, mask5, src10,
2112 src8, mask6, src10, src8, mask7, vec4, vec5, vec6, vec7);
2113 dst97 = __lsx_vdp2_h_bu_b(vec0, filt0);
2114 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2,
2116 dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3);
2117 dst108 = __lsx_vdp2_h_bu_b(vec4, filt0);
2118 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6,
2119 filt2, dst108, dst108);
2120 dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3);
/* Build the row pairs 76, 87, 98 and 109 needed by the vertical pass. */
2122 DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r);
2123 dst109_r = __lsx_vilvh_h(dst108, dst97);
2124 dst66 = __lsx_vreplvei_d(dst97, 1);
2125 dst98_r = __lsx_vilvl_h(dst66, dst108);
/* Vertical pass: four-term accumulation with filt_h0..filt_h3. */
2127 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
2128 filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
2129 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r,
2130 filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1,
2131 dst0_r, dst1_r, dst2_r, dst3_r);
2132 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r,
2133 filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2,
2134 dst0_r, dst1_r, dst2_r, dst3_r);
2135 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r,
2136 filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3,
2137 dst0_r, dst1_r, dst2_r, dst3_r);
/* Shift the word sums right by 6 before packing. */
2138 DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
2139 dst0_r, dst1_r, dst2_r, dst3_r);
2140 DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r,
2142 __lsx_vstelm_d(dst0_r,
dst, 0, 0);
2144 __lsx_vstelm_d(dst0_r,
dst, 0, 1);
2146 __lsx_vstelm_d(dst2_r,
dst, 0, 0);
2148 __lsx_vstelm_d(dst2_r,
dst, 0, 1);
/* Carry element 1 of dst108 into the next iteration. */
2157 dst66 = __lsx_vreplvei_d(dst108, 1);
/*
 * Fragment of a thin wrapper: the visible tail of its call forwards
 * filter_x, filter_y and height with a trailing constant 16.
 * NOTE(review): presumably that constant is the block width -- wrapper and
 * callee names are on lines not visible here, confirm.
 */
2163 const int8_t *filter_x,
const int8_t *filter_y,
2167 filter_x, filter_y,
height, 16);
/*
 * Fragment of a thin wrapper: the visible tail of its call forwards
 * filter_x, filter_y and height with a trailing constant 24.
 * NOTE(review): presumably that constant is the block width -- wrapper and
 * callee names are on lines not visible here, confirm.
 */
2172 const int8_t *filter_x,
const int8_t *filter_y,
2176 filter_x, filter_y,
height, 24);
/*
 * Fragment of a thin wrapper: the visible tail of its call forwards
 * filter_x, filter_y and height with a trailing constant 32.
 * NOTE(review): presumably that constant is the block width -- wrapper and
 * callee names are on lines not visible here, confirm.
 */
2181 const int8_t *filter_x,
const int8_t *filter_y,
2185 filter_x, filter_y,
height, 32);
/*
 * Fragment of a thin wrapper: the visible tail of its call forwards
 * filter_x, filter_y and height with a trailing constant 48.
 * NOTE(review): presumably that constant is the block width -- wrapper and
 * callee names are on lines not visible here, confirm.
 */
2190 const int8_t *filter_x,
const int8_t *filter_y,
2194 filter_x, filter_y,
height, 48);
/*
 * Fragment of a thin wrapper: the visible tail of its call forwards
 * filter_x, filter_y and height with a trailing constant 64.
 * NOTE(review): presumably that constant is the block width -- wrapper and
 * callee names are on lines not visible here, confirm.
 */
2199 const int8_t *filter_x,
const int8_t *filter_y,
2203 filter_x, filter_y,
height, 64);
2215 __m128i filt0, filt1;
2217 __m128i mask1, mask2, mask3;
2218 __m128i dst0, dst1, dst2, dst3;
2219 __m128i vec0, vec1, vec2, vec3;
2224 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
2225 mask3 = __lsx_vaddi_bu(mask0, 10);
2227 for (loop_cnt =
height; loop_cnt--;) {
2236 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
2237 vec3, filt0, dst0, dst1, dst2, dst3);
2242 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
2243 dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
2244 __lsx_vst(dst0,
dst, 0);
2245 __lsx_vst(dst1,
dst, 16);
2246 __lsx_vst(dst2,
dst, 32);
2247 __lsx_vst(dst3,
dst, 48);
2260 int32_t src_stride_2x = (src_stride << 1);
2261 int32_t src_stride_3x = src_stride_2x + src_stride;
2263 __m128i src10_r, src32_r, src21_r, src43_r;
2264 __m128i src10_l, src32_l, src21_l, src43_l;
2265 __m128i dst0_r, dst1_r, dst0_l, dst1_l;
2266 __m128i filt0, filt1;
2273 src += src_stride_3x;
2277 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2278 src3 = __lsx_vld(
src, 0);
2279 src4 = __lsx_vldx(
src, src_stride);
2280 src += src_stride_2x;
2281 DUP2_ARG2(__lsx_vilvl_b, src3,
src2, src4, src3, src32_r, src43_r);
2282 DUP2_ARG2(__lsx_vilvh_b, src3,
src2, src4, src3, src32_l, src43_l);
2283 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
2284 filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2285 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
2286 src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
2287 filt1, dst0_r, dst0_l, dst1_r, dst1_l);
2288 __lsx_vst(dst0_r,
dst, 0);
2289 __lsx_vst(dst0_l,
dst, 16);
2291 __lsx_vst(dst1_r,
dst, 0);
2292 __lsx_vst(dst1_l,
dst, 16);
2295 src5 = __lsx_vld(
src, 0);
2296 src2 = __lsx_vldx(
src, src_stride);
2297 src += src_stride_2x;
2298 DUP2_ARG2(__lsx_vilvl_b, src5, src4,
src2, src5, src10_r, src21_r);
2299 DUP2_ARG2(__lsx_vilvh_b, src5, src4,
src2, src5, src10_l, src21_l);
2300 DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
2301 filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2302 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
2303 src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
2304 filt1, dst0_r, dst0_l, dst1_r, dst1_l);
2305 __lsx_vst(dst0_r,
dst, 0);
2306 __lsx_vst(dst0_l,
dst, 16);
2308 __lsx_vst(dst1_r,
dst, 0);
2309 __lsx_vst(dst1_l,
dst, 16);
2322 int32_t src_stride_2x = (src_stride << 1);
2323 int32_t src_stride_3x = src_stride_2x + src_stride;
2324 const uint8_t *
_src;
2327 __m128i src6, src7, src8, src9, src10, src11;
2328 __m128i src10_r, src32_r, src76_r, src98_r;
2329 __m128i src21_r, src43_r, src87_r, src109_r;
2330 __m128i dst0_r, dst1_r, dst2_r, dst3_r;
2331 __m128i src10_l, src32_l, src21_l, src43_l;
2332 __m128i dst0_l, dst1_l;
2333 __m128i filt0, filt1;
2344 src6 = __lsx_vld(
_src, 0);
2346 src += src_stride_3x;
2347 _src += src_stride_3x;
2348 DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
2350 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2353 src += src_stride_2x;
2354 _src += src_stride_2x;
2355 DUP2_ARG2(__lsx_vilvl_b, src3,
src2, src4, src3, src32_r, src43_r);
2356 DUP2_ARG2(__lsx_vilvh_b, src3,
src2, src4, src3, src32_l, src43_l);
2358 DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
2359 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
2360 filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2361 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
2362 src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
2363 filt1, dst0_r, dst0_l, dst1_r, dst1_l);
2364 DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
2366 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
2367 src109_r, filt1, dst2_r, dst3_r);
2369 __lsx_vst(dst0_r,
dst, 0);
2370 __lsx_vst(dst0_l,
dst, 16);
2371 __lsx_vst(dst2_r,
dst, 32);
2373 __lsx_vst(dst1_r,
dst, 0);
2374 __lsx_vst(dst1_l,
dst, 16);
2375 __lsx_vst(dst3_r,
dst, 32);
2380 src += src_stride_2x;
2381 _src += src_stride_2x;
2382 DUP2_ARG2(__lsx_vilvl_b, src5, src4,
src2, src5, src10_r, src21_r);
2383 DUP2_ARG2(__lsx_vilvh_b, src5, src4,
src2, src5, src10_l, src21_l);
2385 DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
2387 DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
2388 filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2389 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, src10_l,
2390 filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, filt1,
2391 dst0_r, dst0_l, dst1_r, dst1_l);
2392 DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
2394 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r, src87_r,
2395 filt1, dst2_r, dst3_r);
2397 __lsx_vst(dst0_r,
dst, 0);
2398 __lsx_vst(dst0_l,
dst, 16);
2399 __lsx_vst(dst2_r,
dst, 32);
2401 __lsx_vst(dst1_r,
dst, 0);
2402 __lsx_vst(dst1_l,
dst, 16);
2403 __lsx_vst(dst3_r,
dst, 32);
2416 int32_t src_stride_2x = (src_stride << 1);
2417 int32_t src_stride_3x = src_stride_2x + src_stride;
2418 const uint8_t *
_src;
2421 __m128i src6, src7, src8, src9, src10, src11;
2422 __m128i src10_r, src32_r, src76_r, src98_r;
2423 __m128i src21_r, src43_r, src87_r, src109_r;
2424 __m128i dst0_r, dst1_r, dst2_r, dst3_r;
2425 __m128i src10_l, src32_l, src76_l, src98_l;
2426 __m128i src21_l, src43_l, src87_l, src109_l;
2427 __m128i dst0_l, dst1_l, dst2_l, dst3_l;
2428 __m128i filt0, filt1;
2439 src6 = __lsx_vld(
_src, 0);
2441 src += src_stride_3x;
2442 _src += src_stride_3x;
2443 DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
2444 DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
2446 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2449 src += src_stride_2x;
2450 _src += src_stride_2x;
2451 DUP2_ARG2(__lsx_vilvl_b, src3,
src2, src4, src3, src32_r, src43_r);
2452 DUP2_ARG2(__lsx_vilvh_b, src3,
src2, src4, src3, src32_l, src43_l);
2454 DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
2455 DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
2457 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
2458 filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2459 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
2460 src32_l, filt1, dst1_r, src43_r, filt1, dst1_l,src43_l,
2461 filt1, dst0_r, dst0_l, dst1_r, dst1_l);
2462 DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
2463 filt0, src87_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l);
2464 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst2_l, src98_l,
2465 filt1, dst3_r, src109_r, filt1, dst3_l, src109_l, filt1,
2466 dst2_r, dst2_l, dst3_r, dst3_l);
2468 __lsx_vst(dst0_r,
dst, 0);
2469 __lsx_vst(dst0_l,
dst, 16);
2470 __lsx_vst(dst2_r,
dst, 32);
2471 __lsx_vst(dst2_l,
dst, 48);
2473 __lsx_vst(dst1_r,
dst, 0);
2474 __lsx_vst(dst1_l,
dst, 16);
2475 __lsx_vst(dst3_r,
dst, 32);
2476 __lsx_vst(dst3_l,
dst, 48);
2481 src += src_stride_2x;
2482 _src += src_stride_2x;
2483 DUP2_ARG2(__lsx_vilvl_b, src5, src4,
src2, src5, src10_r, src21_r);
2484 DUP2_ARG2(__lsx_vilvh_b, src5, src4,
src2, src5, src10_l, src21_l);
2486 DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
2487 DUP2_ARG2(__lsx_vilvh_b, src11, src10, src8, src11, src76_l, src87_l);
2489 DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
2490 filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2491 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
2492 src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
2493 filt1, dst0_r, dst0_l, dst1_r, dst1_l);
2495 DUP4_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src98_l, filt0, src109_r,
2496 filt0, src109_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l);
2497 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst2_l, src76_l,
2498 filt1, dst3_r, src87_r, filt1, dst3_l, src87_l, filt1,
2499 dst2_r, dst2_l, dst3_r, dst3_l);
2501 __lsx_vst(dst0_r,
dst, 0);
2502 __lsx_vst(dst0_l,
dst, 16);
2503 __lsx_vst(dst2_r,
dst, 32);
2504 __lsx_vst(dst2_l,
dst, 48);
2506 __lsx_vst(dst1_r,
dst, 0);
2507 __lsx_vst(dst1_l,
dst, 16);
2508 __lsx_vst(dst3_r,
dst, 32);
2509 __lsx_vst(dst3_l,
dst, 48);
2518 const int8_t *filter_x,
2519 const int8_t *filter_y)
2521 int32_t src_stride_2x = (src_stride << 1);
2522 int32_t src_stride_4x = (src_stride << 2);
2523 int32_t src_stride_3x = src_stride_2x + src_stride;
2526 __m128i filt0, filt1;
2527 __m128i filt_h0, filt_h1;
2531 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2532 __m128i dst0, dst1, dst2, dst3, dst4;
2533 __m128i dst0_r, dst0_l, dst1_r, dst1_l;
2534 __m128i dst10_r, dst32_r, dst21_r, dst43_r;
2535 __m128i dst10_l, dst32_l, dst21_l, dst43_l;
2537 src -= (src_stride + 1);
2538 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2540 filter_vec = __lsx_vld(filter_y, 0);
2541 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2542 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2544 mask1 = __lsx_vaddi_bu(mask0, 2);
2548 src3 = __lsx_vldx(
src, src_stride_3x);
2549 src4 = __lsx_vldx(
src, src_stride_4x);
2554 DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
2555 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
2557 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
2558 vec6, filt0, dst0, dst1, dst2, dst3);
2559 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
2560 dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
2561 dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
2562 dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
2564 DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2565 DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
2566 DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
2567 DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
2569 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
2570 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
2571 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
2572 filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
2573 dst0_r, dst0_l, dst1_r, dst1_l);
2574 DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
2575 dst0_r, dst0_l, dst1_r, dst1_l);
2576 DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
2577 __lsx_vst(dst0_r,
dst, 0);
2578 __lsx_vst(dst1_r,
dst + dst_stride, 0);
2583 const int8_t *filter_x,
2584 const int8_t *filter_y,
int32_t width8mult)
2587 int32_t src_stride_2x = (src_stride << 1);
2588 int32_t dst_stride_x = (dst_stride << 1);
2589 int32_t src_stride_4x = (src_stride << 2);
2590 int32_t dst_stride_2x = (dst_stride << 2);
2591 int32_t src_stride_3x = src_stride_2x + src_stride;
2592 int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
2594 __m128i
src0,
src1,
src2, src3, src4, src5, src6, mask0, mask1;
2595 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2596 __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
2597 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6;
2598 __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
2599 __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
2600 __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
2602 src -= (src_stride + 1);
2603 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2605 filter_vec = __lsx_vld(filter_y, 0);
2606 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2607 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2610 mask1 = __lsx_vaddi_bu(mask0, 2);
2612 for (cnt = width8mult; cnt--;) {
2615 src_stride_3x,
src, src_stride_4x,
src1,
src2, src3, src4);
2616 src += src_stride_4x;
2617 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
2618 src += (8 - src_stride_4x);
2627 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
2628 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
2630 dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
2631 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
2633 DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2634 DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
2636 DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
2638 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
2640 DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
2642 DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
2644 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
2645 vec6, filt0, dst3, dst4, dst5, dst6);
2646 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
2647 dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
2648 DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
2649 DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
2650 DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
2651 DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
2653 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
2654 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
2655 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
2656 filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
2657 dst0_r, dst0_l, dst1_r, dst1_l);
2659 DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
2660 filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
2661 DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
2662 filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
2663 dst2_r, dst2_l, dst3_r, dst3_l);
2664 DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
2665 dst0_r, dst0_l, dst1_r, dst1_l);
2666 DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
2667 dst2_r, dst2_l, dst3_r, dst3_l);
2668 DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
2670 DUP2_ARG2(__lsx_vpickev_h, dst2_l, dst2_r, dst3_l, dst3_r,
2673 __lsx_vst(dst0_r,
dst, 0);
2674 __lsx_vstx(dst1_r,
dst, dst_stride_x);
2675 __lsx_vstx(dst2_r,
dst, dst_stride_2x);
2676 __lsx_vstx(dst3_r,
dst, dst_stride_3x);
2685 const int8_t *filter_x,
2686 const int8_t *filter_y)
2688 int32_t src_stride_2x = (src_stride << 1);
2689 int32_t dst_stride_2x = (dst_stride << 1);
2690 int32_t src_stride_4x = (src_stride << 2);
2691 int32_t src_stride_3x = src_stride_2x + src_stride;
2692 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
2693 __m128i filt0, filt1;
2694 __m128i filt_h0, filt_h1;
2696 __m128i mask1, filter_vec;
2697 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2698 __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
2699 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2700 __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
2701 __m128i dst4_r, dst4_l, dst5_r, dst5_l;
2702 __m128i dst10_r, dst32_r, dst10_l, dst32_l;
2703 __m128i dst21_r, dst43_r, dst21_l, dst43_l;
2704 __m128i dst54_r, dst54_l, dst65_r, dst65_l;
2705 __m128i dst76_r, dst76_l, dst87_r, dst87_l;
2707 src -= (src_stride + 1);
2708 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2710 filter_vec = __lsx_vld(filter_y, 0);
2711 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2712 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2714 mask1 = __lsx_vaddi_bu(mask0, 2);
2718 src_stride_3x,
src, src_stride_4x,
src1,
src2, src3, src4);
2719 src += src_stride_4x;
2721 src_stride_3x,
src, src_stride_4x, src5, src6, src7, src8);
2724 mask0,
src1,
src1, mask1, vec0, vec1, vec2, vec3);
2726 mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
2727 DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
2728 mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
2729 DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
2730 mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
2731 DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1,
2734 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
2735 filt0, dst0, dst1, dst2, dst3);
2736 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
2737 dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
2738 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0,
2739 vec14, filt0, dst4, dst5, dst6, dst7);
2740 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
2741 vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
2742 dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
2743 dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
2745 DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
2746 dst10_r, dst21_r, dst32_r, dst43_r);
2747 DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
2748 dst10_l, dst21_l, dst32_l, dst43_l);
2749 DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
2750 dst54_r, dst65_r, dst76_r, dst87_r);
2751 DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
2752 dst54_l, dst65_l, dst76_l, dst87_l);
2754 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
2755 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
2756 DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
2757 filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
2758 DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
2759 filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
2760 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
2761 filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
2762 dst0_r, dst0_l, dst1_r, dst1_l);
2763 DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
2764 filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
2765 dst2_r, dst2_l, dst3_r, dst3_l);
2766 DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
2767 filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
2768 dst4_r, dst4_l, dst5_r, dst5_l);
2769 DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, dst0_r,
2770 dst0_l, dst1_r, dst1_l);
2771 DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6, dst2_r,
2772 dst2_l, dst3_r, dst3_l);
2773 DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6, dst4_r,
2774 dst4_l, dst5_r, dst5_l);
2776 DUP4_ARG2(__lsx_vpickev_h,dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
2777 dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
2778 DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
2780 __lsx_vst(dst0_r,
dst, 0);
2781 __lsx_vstx(dst1_r,
dst, dst_stride_2x);
2782 dst += dst_stride_2x;
2783 __lsx_vst(dst2_r,
dst, 0);
2784 __lsx_vstx(dst3_r,
dst, dst_stride_2x);
2785 dst += dst_stride_2x;
2786 __lsx_vst(dst4_r,
dst, 0);
2787 __lsx_vstx(dst5_r,
dst, dst_stride_2x);
2794 const int8_t *filter_x,
2795 const int8_t *filter_y,
2799 uint32_t loop_cnt, cnt;
2800 const uint8_t *src_tmp;
2802 int32_t src_stride_2x = (src_stride << 1);
2803 int32_t dst_stride_x = (dst_stride << 1);
2804 int32_t src_stride_4x = (src_stride << 2);
2805 int32_t dst_stride_2x = (dst_stride << 2);
2806 int32_t src_stride_3x = src_stride_2x + src_stride;
2807 int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
2810 __m128i filt0, filt1;
2811 __m128i filt_h0, filt_h1;
2813 __m128i mask1, filter_vec;
2814 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2815 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6;
2816 __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
2817 __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
2818 __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
2820 src -= (src_stride + 1);
2821 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2823 filter_vec = __lsx_vld(filter_y, 0);
2824 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2825 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2827 mask1 = __lsx_vaddi_bu(mask0, 2);
2829 for (cnt = width8mult; cnt--;) {
2833 src0 = __lsx_vld(src_tmp, 0);
2834 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
2836 src_tmp += src_stride_3x;
2845 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
2846 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
2848 dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
2849 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
2851 DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2852 DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
2854 for (loop_cnt =
height >> 2; loop_cnt--;) {
2855 src3 = __lsx_vld(src_tmp, 0);
2856 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
2858 src6 = __lsx_vldx(src_tmp, src_stride_3x);
2859 src_tmp += src_stride_4x;
2861 DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
2863 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
2865 DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
2867 DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
2870 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
2871 vec6, filt0, dst3, dst4, dst5, dst6);
2872 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
2873 filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3,
2876 DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
2877 DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
2878 DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
2879 DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
2881 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
2882 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
2883 DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
2884 filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
2885 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
2886 dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
2887 dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
2888 DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
2889 dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
2890 dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
2892 DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
2893 dst0_r, dst0_l, dst1_r, dst1_l);
2894 DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
2895 dst2_r, dst2_l, dst3_r, dst3_l);
2897 DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
2898 dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
2900 __lsx_vst(dst0_r, dst_tmp, 0);
2901 __lsx_vstx(dst1_r, dst_tmp, dst_stride_x);
2902 __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x);
2903 __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x);
2904 dst_tmp += dst_stride_2x;
2921 const int8_t *filter_x,
2922 const int8_t *filter_y,
2928 filter_x, filter_y);
2929 }
else if (4 ==
height) {
2931 filter_x, filter_y, 1);
2932 }
else if (6 ==
height) {
2934 filter_x, filter_y);
2935 }
else if (0 == (
height & 0x03)) {
2937 filter_x, filter_y,
height, 1);
2945 const int8_t *filter_x,
2946 const int8_t *filter_y,
2950 const uint8_t *src_tmp;
2952 int32_t src_stride_2x = (src_stride << 1);
2953 int32_t dst_stride_x = (dst_stride << 1);
2954 int32_t src_stride_4x = (src_stride << 2);
2955 int32_t dst_stride_2x = (dst_stride << 2);
2956 int32_t src_stride_3x = src_stride_2x + src_stride;
2957 int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
2960 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2961 __m128i mask0, mask1;
2962 __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, dst0;
2963 __m128i dst1, dst2, dst3, dst4, dst5, dst6;
2964 __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
2965 __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
2966 __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
2968 src -= (src_stride + 1);
2969 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2971 filter_vec = __lsx_vld(filter_y, 0);
2972 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2973 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2976 mask1 = __lsx_vaddi_bu(mask0, 2);
2981 src0 = __lsx_vld(src_tmp, 0);
2983 src_tmp += src_stride_3x;
2989 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
2990 dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
2991 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst0, dst1);
2992 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
2994 DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2995 DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
2997 for (loop_cnt =
height >> 2; loop_cnt--;) {
2998 src3 = __lsx_vld(src_tmp, 0);
2999 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, src4, src5);
3000 src6 = __lsx_vldx(src_tmp, src_stride_3x);
3001 src_tmp += src_stride_4x;
3003 DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
3004 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
3005 DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
3006 DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
3008 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
3009 vec6, filt0, dst3, dst4, dst5, dst6);
3010 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
3011 dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
3013 DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
3014 DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
3015 DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
3016 DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
3018 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
3019 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
3020 DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
3021 filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
3022 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
3023 filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
3024 dst0_r, dst0_l, dst1_r, dst1_l);
3025 DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
3026 filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
3027 dst2_r, dst2_l, dst3_r, dst3_l);
3028 DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst0_r, dst1_r);
3029 DUP2_ARG3(__lsx_vsrani_h_w, dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, dst2_r, dst3_r);
3030 __lsx_vst(dst0_r, dst_tmp, 0);
3031 __lsx_vstx(dst1_r, dst_tmp, dst_stride_x);
3032 __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x);
3033 __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x);
3034 dst_tmp += dst_stride_2x;
3046 src0 = __lsx_vld(src_tmp, 0);
3048 src_tmp += src_stride_3x;
3054 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
3055 dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
3056 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst0, dst1);
3057 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
3059 DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3060 DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
3062 for (loop_cnt =
height >> 2; loop_cnt--;) {
3063 src3 = __lsx_vld(src_tmp, 0);
3064 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, src4, src5);
3065 src6 = __lsx_vldx(src_tmp, src_stride_3x);
3066 src_tmp += src_stride_4x;
3068 DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
3069 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
3070 DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
3071 DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
3073 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
3074 vec6, filt0, dst3, dst4, dst5, dst6);
3075 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
3076 dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
3078 DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
3079 DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
3080 DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
3081 DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
3083 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
3084 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
3085 DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
3086 filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
3087 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
3088 filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
3089 dst0_r, dst0_l, dst1_r, dst1_l);
3090 DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
3091 filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
3092 dst2_r, dst2_l, dst3_r, dst3_l);
3093 DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst0_r, dst1_r);
3094 DUP2_ARG3(__lsx_vsrani_h_w, dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, dst2_r, dst3_r);
3095 __lsx_vstelm_d(dst0_r, dst_tmp, 0, 0);
3096 dst_tmp += dst_stride;
3097 __lsx_vstelm_d(dst1_r, dst_tmp, 0, 0);
3098 dst_tmp += dst_stride;
3099 __lsx_vstelm_d(dst2_r, dst_tmp, 0, 0);
3100 dst_tmp += dst_stride;
3101 __lsx_vstelm_d(dst3_r, dst_tmp, 0, 0);
3102 dst_tmp += dst_stride;
3117 const int8_t *filter_x,
3118 const int8_t *filter_y,
3123 filter_x, filter_y, 2);
3126 filter_x, filter_y,
height, 2);
3134 const int8_t *filter_x,
3135 const int8_t *filter_y,
3139 filter_x, filter_y,
height, 3);
3146 const int8_t *filter_x,
3147 const int8_t *filter_y,
3151 filter_x, filter_y,
height, 4);
3154 #define MC_COPY(WIDTH) \
3155 void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_lsx(int16_t *dst, \
3156 const uint8_t *src, \
3157 ptrdiff_t src_stride, \
3163 hevc_copy_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE, height); \
3178 #define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
3179 void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_lsx(int16_t *dst, \
3180 const uint8_t *src, \
3181 ptrdiff_t src_stride, \
3187 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR]; \
3189 hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, \
3190 MAX_PB_SIZE, filter, height); \
/*
 * Horizontal 8-tap qpel entry points for widths 4, 8, 12, 16, 24, 32, 48, 64.
 * Going by the visible lines of the MC() definition in this file, each
 * MC(qpel, h, W, 8, hz, mx) expands to ff_hevc_put_hevc_qpel_hW_8_lsx(),
 * which reads its coefficients from ff_hevc_qpel_filters[mx] and forwards to
 * hevc_hz_8t_Ww_lsx(), passing MAX_PB_SIZE as the fourth argument.
 */
3193 MC(qpel,
h, 4, 8, hz,
mx);
3194 MC(qpel,
h, 8, 8, hz,
mx);
3195 MC(qpel,
h, 12, 8, hz,
mx);
3196 MC(qpel,
h, 16, 8, hz,
mx);
3197 MC(qpel,
h, 24, 8, hz,
mx);
3198 MC(qpel,
h, 32, 8, hz,
mx);
3199 MC(qpel,
h, 48, 8, hz,
mx);
3200 MC(qpel,
h, 64, 8, hz,
mx);
/*
 * Vertical 8-tap qpel entry points for widths 4, 8, 12, 16, 24, 32, 48, 64.
 * Going by the visible lines of the MC() definition in this file, each
 * MC(qpel, v, W, 8, vt, my) expands to ff_hevc_put_hevc_qpel_vW_8_lsx(),
 * which reads its coefficients from ff_hevc_qpel_filters[my] and forwards to
 * hevc_vt_8t_Ww_lsx(), passing MAX_PB_SIZE as the fourth argument.
 */
3202 MC(qpel, v, 4, 8, vt,
my);
3203 MC(qpel, v, 8, 8, vt,
my);
3204 MC(qpel, v, 12, 8, vt,
my);
3205 MC(qpel, v, 16, 8, vt,
my);
3206 MC(qpel, v, 24, 8, vt,
my);
3207 MC(qpel, v, 32, 8, vt,
my);
3208 MC(qpel, v, 48, 8, vt,
my);
3209 MC(qpel, v, 64, 8, vt,
my);
/*
 * 4-tap epel entry points. Only a subset of widths is instantiated here:
 * horizontal for width 32, vertical for widths 16, 24 and 32.
 * Going by the visible lines of the MC() definition in this file, these
 * expand to ff_hevc_put_hevc_epel_h32_8_lsx() and
 * ff_hevc_put_hevc_epel_v{16,24,32}_8_lsx(). The horizontal variant indexes
 * ff_hevc_epel_filters with mx and calls hevc_hz_4t_32w_lsx(); the vertical
 * variants index it with my and call hevc_vt_4t_{16,24,32}w_lsx().
 */
3211 MC(epel,
h, 32, 4, hz,
mx);
3213 MC(epel, v, 16, 4, vt,
my);
3214 MC(epel, v, 24, 4, vt,
my);
3215 MC(epel, v, 32, 4, vt,
my);
3219 #define MC_HV(PEL, WIDTH, TAP) \
3220 void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_lsx(int16_t *dst, \
3221 const uint8_t *src, \
3222 ptrdiff_t src_stride, \
3228 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx]; \
3229 const int8_t *filter_y = ff_hevc_##PEL##_filters[my]; \
3231 hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE, \
3232 filter_x, filter_y, height); \