27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
31 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
/*
 * HORIZ_8TAP_4WID_4VECS_FILT: four-step filter over four source vectors
 * that leaves its result in out0 and out1.
 *
 * Each step hands the source pairs (src0, src1) and (src2, src3) to
 * VSHF_B2_SB with a single mask used for both pairs, yielding two
 * temporaries. The mask0 temporaries go to DOTP_SB2_SH with filt0; the
 * mask1, mask2 and mask3 temporaries go to DPADD_SB2_SH with filt1, filt2
 * and filt3 respectively, always with out0/out1 as the trailing arguments.
 * Every step writes its own pair of vecN_m temporaries (vec0_m..vec7_m).
 *
 * Presumably VSHF_B2_SB is a byte shuffle, DOTP_SB2_SH a signed dot product
 * and DPADD_SB2_SH a dot-product accumulate; those helpers are defined
 * elsewhere - TODO confirm.
 *
 * NOTE(review): out0/out1 are used in the body but do not appear in the
 * parameter lines shown here, and no enclosing braces are visible around
 * the vecN_m declarations - confirm against the complete definition.
 */
34 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
35 mask0, mask1, mask2, mask3, \
36 filt0, filt1, filt2, filt3, \
39 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
41 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
42 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
43 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
44 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
45 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
46 DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \
47 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
48 DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1); \
/*
 * HORIZ_8TAP_8WID_4VECS_FILT: four-step filter over four source vectors
 * that leaves one result per source in out0..out3.
 *
 * Unlike the 4-wide variant, every source is passed to VSHF_B2_SB twice
 * (src0, src0 / src1, src1 / ...), so each temporary is built from a single
 * source vector. The steps run in the order mask0/filt0, mask2/filt2,
 * mask1/filt1, mask3/filt3: the first uses DOTP_SB4_SH, the other three use
 * DPADD_SB4_SH with out0..out3 as the trailing arguments.
 *
 * vec0_m..vec3_m are written by the mask0 step and overwritten by the mask2
 * step; vec4_m..vec7_m are written by the mask1 step and overwritten by the
 * mask3 step.
 *
 * Presumably VSHF_B2_SB is a byte shuffle, DOTP_SB4_SH a signed dot product
 * and DPADD_SB4_SH a dot-product accumulate; those helpers are defined
 * elsewhere - TODO confirm.
 *
 * NOTE(review): no enclosing braces are visible around the vecN_m
 * declarations - confirm against the complete definition.
 */
51 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
52 mask0, mask1, mask2, mask3, \
53 filt0, filt1, filt2, filt3, \
54 out0, out1, out2, out3) \
56 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
58 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
59 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
60 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
61 out0, out1, out2, out3); \
62 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
63 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
64 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
65 out0, out1, out2, out3); \
66 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
67 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
68 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
69 out0, out1, out2, out3); \
70 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
71 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
72 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
73 out0, out1, out2, out3); \
/*
 * HORIZ_4TAP_4WID_4VECS_FILT: two-step filter over four source vectors
 * that leaves its result in out0 and out1.
 *
 * The source pairs (src0, src1) and (src2, src3) are passed to VSHF_B2_SB
 * with mask0 and the resulting temporaries go to DOTP_SB2_SH with filt0;
 * the same pairs are then passed with mask1 and those temporaries go to
 * DPADD_SB2_SH with filt1, with out0/out1 as the trailing arguments.
 *
 * Presumably VSHF_B2_SB is a byte shuffle, DOTP_SB2_SH a signed dot product
 * and DPADD_SB2_SH a dot-product accumulate; those helpers are defined
 * elsewhere - TODO confirm.
 *
 * NOTE(review): out0/out1 are used in the body but do not appear in the
 * parameter lines shown here, and no enclosing braces are visible around
 * the vecN_m declarations - confirm against the complete definition.
 */
76 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
77 mask0, mask1, filt0, filt1, \
80 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
82 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
83 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
84 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
85 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
/*
 * HORIZ_4TAP_8WID_4VECS_FILT: two-step filter over four source vectors
 * that leaves one result per source in out0..out3.
 *
 * Every source is passed to VSHF_B2_SB twice (src0, src0 / src1, src1 /
 * ...), so each temporary is built from a single source vector. The mask0
 * temporaries go to DOTP_SB4_SH with filt0; vec0_m..vec3_m are then
 * overwritten by the mask1 shuffles and passed to DPADD_SB4_SH with filt1,
 * with out0..out3 as the trailing arguments.
 *
 * Presumably VSHF_B2_SB is a byte shuffle, DOTP_SB4_SH a signed dot product
 * and DPADD_SB4_SH a dot-product accumulate; those helpers are defined
 * elsewhere - TODO confirm.
 *
 * NOTE(review): no enclosing braces are visible around the vecN_m
 * declarations - confirm against the complete definition.
 */
88 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
89 mask0, mask1, filt0, filt1, \
90 out0, out1, out2, out3) \
92 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
94 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
95 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
96 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
97 out0, out1, out2, out3); \
98 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
99 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
100 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
101 out0, out1, out2, out3); \
105 uint8_t *dst,
int32_t dst_stride,
109 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
112 LD2(
src, src_stride, out0, out1);
117 LD4(
src, src_stride, out0, out1, out2, out3);
118 src += (4 * src_stride);
119 SD4(out0, out1, out2, out3, dst, dst_stride);
120 dst += (4 * dst_stride);
121 LD2(
src, src_stride, out0, out1);
125 }
else if (0 == (
height % 8)) {
126 for (cnt = (
height >> 3); cnt--;) {
127 LD4(
src, src_stride, out0, out1, out2, out3);
128 src += (4 * src_stride);
129 LD4(
src, src_stride, out4, out5, out6, out7);
130 src += (4 * src_stride);
131 SD4(out0, out1, out2, out3, dst, dst_stride);
132 dst += (4 * dst_stride);
133 SD4(out4, out5, out6, out7, dst, dst_stride);
134 dst += (4 * dst_stride);
136 }
else if (0 == (
height % 4)) {
137 for (cnt = (
height >> 2); cnt--;) {
138 LD4(
src, src_stride, out0, out1, out2, out3);
139 src += (4 * src_stride);
140 SD4(out0, out1, out2, out3, dst, dst_stride);
141 dst += (4 * dst_stride);
147 uint8_t *dst,
int32_t dst_stride,
150 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
153 src += (8 * src_stride);
155 dst += (8 * dst_stride);
161 uint8_t *dst,
int32_t dst_stride,
165 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
169 src += (8 * src_stride);
170 ST_UB8(
src0,
src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
171 dst += (8 * dst_stride);
173 src += (4 * src_stride);
175 dst += (4 * dst_stride);
176 }
else if (0 == (
height % 8)) {
177 for (cnt = (
height >> 3); cnt--;) {
180 src += (8 * src_stride);
183 dst += (8 * dst_stride);
185 }
else if (0 == (
height % 4)) {
186 for (cnt = (
height >> 2); cnt--;) {
188 src += (4 * src_stride);
191 dst += (4 * dst_stride);
197 uint8_t *dst,
int32_t dst_stride,
201 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
202 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
204 for (cnt = 4; cnt--;) {
206 LD4(
src + 16, src_stride, out0, out1, out2, out3);
207 src += (4 * src_stride);
208 LD4(
src + 16, src_stride, out4, out5, out6, out7);
209 src += (4 * src_stride);
211 ST_UB8(
src0,
src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
212 SD4(out0, out1, out2, out3, dst + 16, dst_stride);
213 dst += (4 * dst_stride);
214 SD4(out4, out5, out6, out7, dst + 16, dst_stride);
215 dst += (4 * dst_stride);
220 uint8_t *dst,
int32_t dst_stride,
224 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
226 for (cnt = (
height >> 2); cnt--;) {
228 LD_UB4(
src + 16, src_stride, src4, src5, src6, src7);
229 src += (4 * src_stride);
231 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
232 dst += (4 * dst_stride);
237 uint8_t *dst,
int32_t dst_stride,
241 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
244 for (cnt = (
height >> 2); cnt--;) {
246 LD_UB4(
src + 16, src_stride, src4, src5, src6, src7);
247 LD_UB4(
src + 32, src_stride, src8, src9, src10, src11);
248 src += (4 * src_stride);
251 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
252 ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
253 dst += (4 * dst_stride);
258 uint8_t *dst,
int32_t dst_stride,
262 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
263 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
265 for (cnt = (
height >> 2); cnt--;) {
270 LD_UB4(
src, 16, src8, src9, src10, src11);
272 LD_UB4(
src, 16, src12, src13, src14, src15);
277 ST_UB4(src4, src5, src6, src7, dst, 16);
279 ST_UB4(src8, src9, src10, src11, dst, 16);
281 ST_UB4(src12, src13, src14, src15, dst, 16);
287 uint8_t *dst,
int32_t dst_stride,
290 v16u8 mask0, mask1, mask2, mask3,
out;
291 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
292 v8i16
filt, out0, out1;
308 mask3, filt0, filt1, filt2, filt3, out0, out1);
312 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
316 uint8_t *dst,
int32_t dst_stride,
319 v16i8 filt0, filt1, filt2, filt3;
321 v16u8 mask0, mask1, mask2, mask3,
out;
322 v8i16
filt, out0, out1, out2, out3;
337 src += (4 * src_stride);
339 mask3, filt0, filt1, filt2, filt3, out0, out1);
343 mask3, filt0, filt1, filt2, filt3, out2, out3);
347 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
349 ST_W4(
out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
353 uint8_t *dst,
int32_t dst_stride,
356 v16u8 mask0, mask1, mask2, mask3,
out;
357 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
358 v8i16
filt, out0, out1, out2, out3;
373 src += (4 * src_stride);
375 mask3, filt0, filt1, filt2, filt3, out0, out1);
378 src += (4 * src_stride);
380 mask3, filt0, filt1, filt2, filt3, out2, out3);
384 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
386 ST_W4(
out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
387 dst += (8 * dst_stride);
391 src += (4 * src_stride);
393 mask3, filt0, filt1, filt2, filt3, out0, out1);
396 src += (4 * src_stride);
398 mask3, filt0, filt1, filt2, filt3, out2, out3);
403 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
405 ST_W4(
out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
409 uint8_t *dst,
int32_t dst_stride,
416 }
else if (16 ==
height) {
422 uint8_t *dst,
int32_t dst_stride,
426 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
427 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
428 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
429 v8i16
filt, out0, out1, out2, out3;
442 for (loop_cnt = (
height >> 2); loop_cnt--;) {
445 src += (4 * src_stride);
448 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
449 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
450 out0, out1, out2, out3);
452 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
453 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
454 out0, out1, out2, out3);
456 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
457 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
458 out0, out1, out2, out3);
460 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
461 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
462 out0, out1, out2, out3);
468 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
469 dst += (4 * dst_stride);
474 uint8_t *dst,
int32_t dst_stride,
478 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
479 v16u8 tmp0, tmp1, tmp2;
480 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
481 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
482 v16i8 filt0, filt1, filt2, filt3;
483 v8i16
filt, out0, out1, out2, out3, out4, out5;
501 for (loop_cnt = 4; loop_cnt--;) {
505 LD_SB4(
src + 8, src_stride, src4, src5, src6, src7);
509 src += (4 * src_stride);
512 VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
513 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
516 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
517 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
520 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
521 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
524 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
525 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
529 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
531 VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
533 VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
535 VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
546 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
547 ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
548 dst += (4 * dst_stride);
553 uint8_t *dst,
int32_t dst_stride,
557 v16u8 mask0, mask1, mask2, mask3,
out;
558 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
559 v16i8 filt0, filt1, filt2, filt3;
560 v8i16
filt, out0, out1, out2, out3;
573 for (loop_cnt = (
height >> 2); loop_cnt--;) {
576 src += (2 * src_stride);
580 src += (2 * src_stride);
585 mask3, filt0, filt1, filt2, filt3, out0,
597 mask3, filt0, filt1, filt2, filt3, out0,
611 uint8_t *dst,
int32_t dst_stride,
615 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
616 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7,
out;
617 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
619 v8i16 out0, out1, out2, out3, out8, out9,
filt;
636 for (loop_cnt = 16; loop_cnt--;) {
640 src += (2 * src_stride);
642 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
644 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
648 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
650 DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
651 out0, out8, out2, out9);
654 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
656 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
657 out0, out8, out2, out9);
660 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
662 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
663 out0, out8, out2, out9);
670 ST_D2(
out, 0, 1, dst + 16, dst_stride);
681 uint8_t *dst,
int32_t dst_stride,
685 v16u8 mask0, mask1, mask2, mask3,
out;
686 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
687 v16i8 filt0, filt1, filt2, filt3;
688 v8i16
filt, out0, out1, out2, out3;
701 for (loop_cnt = (
height >> 1); loop_cnt--;) {
717 mask3, filt0, filt1, filt2, filt3, out0,
729 mask3, filt0, filt1, filt2, filt3, out0,
742 uint8_t *dst,
int32_t dst_stride,
746 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
748 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7,
out;
749 v8i16
filt, out0, out1, out2, out3;
766 for (loop_cnt = 64; loop_cnt--;) {
775 src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);
779 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
783 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
787 out2 = __msa_dpadd_s_h(out2, vec2, filt2);
792 out2 = __msa_dpadd_s_h(out2, vec2, filt3);
795 out3 = __msa_srari_h(out2, 6);
800 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
802 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
803 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
806 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
807 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
810 out2 = __msa_dpadd_s_h(out2, vec2, filt2);
811 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
814 out2 = __msa_dpadd_s_h(out2, vec2, filt3);
817 out2 = __msa_srari_h(out2, 6);
828 uint8_t *dst,
int32_t dst_stride,
832 v16u8 mask0, mask1, mask2, mask3,
out;
833 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
834 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
835 v16i8 filt0, filt1, filt2, filt3;
836 v8i16 res0, res1, res2, res3,
filt;
849 for (loop_cnt =
height; loop_cnt--;) {
856 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
857 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
860 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
861 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
864 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
865 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
868 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
869 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
879 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
880 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
881 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
883 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
884 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
885 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
887 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
888 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
889 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
891 VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
892 VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
893 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
907 uint8_t *dst,
int32_t dst_stride,
912 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
913 v16i8 src11, src12, src13, src14;
914 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
915 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
916 v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
917 v16i8 src10998, filt0, filt1, filt2, filt3;
918 v8i16
filt, out10, out32, out54, out76;
920 src -= (3 * src_stride);
926 src += (7 * src_stride);
928 ILVR_B4_SB(
src1,
src0, src3, src2, src5, src4, src2,
src1, src10_r, src32_r,
930 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
931 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
935 for (loop_cnt = (
height >> 3); loop_cnt--;) {
936 LD_SB4(
src, src_stride, src7, src8, src9, src10);
937 src += (4 * src_stride);
938 LD_SB4(
src, src_stride, src11, src12, src13, src14);
939 src += (4 * src_stride);
941 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
942 src87_r, src98_r, src109_r);
943 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
944 src1110_r, src1211_r, src1312_r, src1413_r);
945 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
946 ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
947 src12111110, src14131312);
951 DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
952 DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
953 DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
954 DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
955 DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
956 DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
957 DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
958 DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
965 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
966 dst += (8 * dst_stride);
969 src4332 = src12111110;
970 src6554 = src14131312;
976 uint8_t *dst,
int32_t dst_stride,
980 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
981 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
982 v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
984 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
986 src -= (3 * src_stride);
993 src += (7 * src_stride);
994 ILVR_B4_SB(
src1,
src0, src3, src2, src5, src4, src2,
src1, src10_r, src32_r,
996 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
998 for (loop_cnt = (
height >> 2); loop_cnt--;) {
999 LD_SB4(
src, src_stride, src7, src8, src9, src10);
1001 src += (4 * src_stride);
1003 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1004 src87_r, src98_r, src109_r);
1005 DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1006 filt0, out0_r, out1_r, out2_r, out3_r);
1007 DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1008 filt1, out0_r, out1_r, out2_r, out3_r);
1009 DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1010 filt2, out0_r, out1_r, out2_r, out3_r);
1011 DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1012 filt3, out0_r, out1_r, out2_r, out3_r);
1014 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1017 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
1018 dst += (4 * dst_stride);
1031 uint8_t *dst,
int32_t dst_stride,
1035 uint32_t out2, out3;
1036 uint64_t out0, out1;
1037 v16u8 tmp0, tmp1, tmp2, tmp3;
1038 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1039 v16i8 filt0, filt1, filt2, filt3;
1040 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1041 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1042 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1043 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1045 src -= (3 * src_stride);
1051 src += (7 * src_stride);
1055 ILVR_B4_SB(
src1,
src0, src3, src2, src5, src4, src2,
src1, src10_r, src32_r,
1057 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1058 ILVL_B4_SB(
src1,
src0, src3, src2, src5, src4, src2,
src1, src10_l, src32_l,
1060 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1062 for (loop_cnt = 4; loop_cnt--;) {
1063 LD_SB4(
src, src_stride, src7, src8, src9, src10);
1065 src += (4 * src_stride);
1067 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1068 src87_r, src98_r, src109_r);
1069 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1070 src87_l, src98_l, src109_l);
1072 filt1, filt2, filt3);
1074 filt1, filt2, filt3);
1076 filt1, filt2, filt3);
1078 filt1, filt2, filt3);
1080 filt1, filt2, filt3);
1082 filt1, filt2, filt3);
1084 filt1, filt2, filt3);
1086 filt1, filt2, filt3);
1089 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1090 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1091 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1092 out3_r, tmp0, tmp1, tmp2, tmp3);
1095 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
1096 out1 = __msa_copy_u_d((v2i64) tmp1, 0);
1097 out2 = __msa_copy_u_w((v4i32) tmp0, 2);
1098 out3 = __msa_copy_u_w((v4i32) tmp1, 2);
1100 SW(out2, (dst + 8));
1103 SW(out3, (dst + 8));
1105 out0 = __msa_copy_u_d((v2i64) tmp2, 0);
1106 out1 = __msa_copy_u_d((v2i64) tmp3, 0);
1107 out2 = __msa_copy_u_w((v4i32) tmp2, 2);
1108 out3 = __msa_copy_u_w((v4i32) tmp3, 2);
1110 SW(out2, (dst + 8));
1113 SW(out3, (dst + 8));
1133 uint8_t *dst,
int32_t dst_stride,
1137 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1138 v16i8 filt0, filt1, filt2, filt3;
1139 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1140 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1141 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1142 v16u8 tmp0, tmp1, tmp2, tmp3;
1143 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1145 src -= (3 * src_stride);
1152 src += (7 * src_stride);
1153 ILVR_B4_SB(
src1,
src0, src3, src2, src5, src4, src2,
src1, src10_r, src32_r,
1155 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1156 ILVL_B4_SB(
src1,
src0, src3, src2, src5, src4, src2,
src1, src10_l, src32_l,
1158 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1160 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1161 LD_SB4(
src, src_stride, src7, src8, src9, src10);
1163 src += (4 * src_stride);
1165 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1166 src87_r, src98_r, src109_r);
1167 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1168 src87_l, src98_l, src109_l);
1170 filt1, filt2, filt3);
1172 filt1, filt2, filt3);
1174 filt1, filt2, filt3);
1176 filt1, filt2, filt3);
1178 filt1, filt2, filt3);
1180 filt1, filt2, filt3);
1182 filt1, filt2, filt3);
1184 filt1, filt2, filt3);
1187 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1188 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1189 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1190 out3_r, tmp0, tmp1, tmp2, tmp3);
1192 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1193 dst += (4 * dst_stride);
1212 uint8_t *dst,
int32_t dst_stride,
1218 uint32_t loop_cnt, cnt;
1219 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1220 v16i8 filt0, filt1, filt2, filt3;
1221 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1222 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1223 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1224 v16u8 tmp0, tmp1, tmp2, tmp3;
1225 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1227 src -= (3 * src_stride);
1232 for (cnt = (
width >> 4); cnt--;) {
1236 LD_SB7(src_tmp, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
1238 src_tmp += (7 * src_stride);
1240 src32_r, src54_r, src21_r);
1241 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1243 src32_l, src54_l, src21_l);
1244 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1246 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1247 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1249 src_tmp += (4 * src_stride);
1250 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1251 src87_r, src98_r, src109_r);
1252 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1253 src87_l, src98_l, src109_l);
1255 filt0, filt1, filt2, filt3);
1257 filt0, filt1, filt2, filt3);
1259 filt0, filt1, filt2, filt3);
1261 filt0, filt1, filt2, filt3);
1263 filt0, filt1, filt2, filt3);
1265 filt0, filt1, filt2, filt3);
1267 filt0, filt1, filt2, filt3);
1269 filt0, filt1, filt2, filt3);
1272 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1273 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1274 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1275 out3_r, tmp0, tmp1, tmp2, tmp3);
1277 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1278 dst_tmp += (4 * dst_stride);
1301 uint8_t *dst,
int32_t dst_stride,
1312 uint8_t *dst,
int32_t dst_stride,
1320 uint8_t *dst,
int32_t dst_stride,
1328 uint8_t *dst,
int32_t dst_stride,
1339 const int8_t *filter_x,
1340 const int8_t *filter_y,
1345 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1346 v16i8 src9, src10, src11, src12, src13, src14;
1347 v8i16 filt0, filt1, filt2, filt3;
1348 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1349 v16i8 mask1, mask2, mask3;
1351 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1352 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1353 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1354 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
1355 v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
1356 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1359 src -= ((3 * src_stride) + 3);
1360 filter_vec =
LD_SH(filter_x);
1361 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1363 filter_vec =
LD_SH(filter_y);
1366 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1373 src += (7 * src_stride);
1376 VSHF_B4_SB(
src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1377 VSHF_B4_SB(
src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1378 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1379 vec8, vec9, vec10, vec11);
1380 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1381 vec12, vec13, vec14, vec15);
1396 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1398 for (loop_cnt =
height >> 3; loop_cnt--;) {
1399 LD_SB8(
src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1401 src += (8 * src_stride);
1402 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1404 VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1405 vec0, vec1, vec2, vec3);
1406 VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1407 vec4, vec5, vec6, vec7);
1408 VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1409 vec8, vec9, vec10, vec11);
1410 VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1411 vec12, vec13, vec14, vec15);
1422 dst76_r = __msa_ilvr_h(dst117, dst66);
1425 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1426 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1427 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1429 dst0_r =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1430 filt_h1, filt_h2, filt_h3);
1431 dst1_r =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1432 filt_h1, filt_h2, filt_h3);
1433 dst2_r =
HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1434 filt_h1, filt_h2, filt_h3);
1435 dst3_r =
HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1436 filt_h1, filt_h2, filt_h3);
1437 dst4_r =
HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1438 filt_h1, filt_h2, filt_h3);
1439 dst5_r =
HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1440 filt_h1, filt_h2, filt_h3);
1441 dst6_r =
HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1442 filt_h1, filt_h2, filt_h3);
1444 filt_h0, filt_h1, filt_h2, filt_h3);
1446 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1447 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1450 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1451 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1452 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1453 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1456 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1457 dst += (8 * dst_stride);
1460 dst32_r = dst1110_r;
1461 dst54_r = dst1312_r;
1463 dst43_r = dst1211_r;
1464 dst65_r = dst1413_r;
1465 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1473 const int8_t *filter_x,
1474 const int8_t *filter_y,
1477 uint32_t loop_cnt, cnt;
1481 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1482 v8i16 filt0, filt1, filt2, filt3;
1483 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1484 v16i8 mask1, mask2, mask3;
1486 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1487 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1488 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1489 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1490 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1491 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1492 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1493 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1496 src -= ((3 * src_stride) + 3);
1498 filter_vec =
LD_SH(filter_x);
1499 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1501 filter_vec =
LD_SH(filter_y);
1504 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1510 for (cnt =
width >> 3; cnt--;) {
1514 LD_SB7(src_tmp, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
1515 src_tmp += (7 * src_stride);
1520 vec0, vec1, vec2, vec3);
1522 vec4, vec5, vec6, vec7);
1523 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1524 vec8, vec9, vec10, vec11);
1525 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1526 vec12, vec13, vec14, vec15);
1536 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1537 vec0, vec1, vec2, vec3);
1538 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1539 vec4, vec5, vec6, vec7);
1540 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1541 vec8, vec9, vec10, vec11);
1549 for (loop_cnt =
height >> 1; loop_cnt--;) {
1550 LD_SB2(src_tmp, src_stride, src7, src8);
1552 src_tmp += 2 * src_stride;
1554 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1555 dst10_r, dst32_r, dst54_r, dst21_r);
1556 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1557 dst10_l, dst32_l, dst54_l, dst21_l);
1558 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1559 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1561 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1562 vec0, vec1, vec2, vec3);
1568 filt_h0, filt_h1, filt_h2, filt_h3);
1570 filt_h0, filt_h1, filt_h2, filt_h3);
1574 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1575 vec0, vec1, vec2, vec3);
1581 filt_h0, filt_h1, filt_h2, filt_h3);
1583 filt_h0, filt_h1, filt_h2, filt_h3);
1587 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1589 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1591 ST_D2(
out, 0, 1, dst_tmp, dst_stride);
1592 dst_tmp += (2 * dst_stride);
1612 const int8_t *filter_x,
1613 const int8_t *filter_y,
1617 filter_x, filter_y,
height, 8);
1624 const int8_t *filter_x,
1625 const int8_t *filter_y,
1629 uint8_t *src_tmp, *dst_tmp;
1631 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1632 v16i8 src11, src12, src13, src14;
1633 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1634 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1635 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1636 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1637 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1638 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1639 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1640 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1641 v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1642 v8i16 dst1413_r, dst87_l, filter_vec;
1643 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1644 v4i32 dst0_l, dst1_l;
1646 src -= ((3 * src_stride) + 3);
1648 filter_vec =
LD_SH(filter_x);
1649 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1651 filter_vec =
LD_SH(filter_y);
1654 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1664 LD_SB7(src_tmp, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
1665 src_tmp += (7 * src_stride);
1671 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1673 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1684 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1685 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1686 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1695 for (loop_cnt = 8; loop_cnt--;) {
1696 LD_SB2(src_tmp, src_stride, src7, src8);
1698 src_tmp += 2 * src_stride;
1700 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1701 dst32_r, dst54_r, dst21_r);
1702 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1703 dst32_l, dst54_l, dst21_l);
1704 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1705 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1707 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1714 filt_h0, filt_h1, filt_h2, filt_h3);
1716 filt_h0, filt_h1, filt_h2, filt_h3);
1720 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1727 filt_h0, filt_h1, filt_h2, filt_h3);
1729 filt_h0, filt_h1, filt_h2, filt_h3);
1733 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1735 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1737 ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1738 dst_tmp += (2 * dst_stride);
1758 src += (7 * src_stride);
1761 VSHF_B4_SB(
src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1762 VSHF_B4_SB(
src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1763 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1765 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1781 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1783 for (loop_cnt = 2; loop_cnt--;) {
1784 LD_SB8(
src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1786 src += (8 * src_stride);
1787 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1789 VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1791 VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1793 VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1795 VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1807 dst76_r = __msa_ilvr_h(dst117, dst66);
1810 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1811 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1812 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1814 dst0_r =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1815 filt_h1, filt_h2, filt_h3);
1816 dst1_r =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1817 filt_h1, filt_h2, filt_h3);
1818 dst2_r =
HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1819 filt_h1, filt_h2, filt_h3);
1820 dst3_r =
HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1821 filt_h1, filt_h2, filt_h3);
1822 dst4_r =
HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1823 filt_h1, filt_h2, filt_h3);
1824 dst5_r =
HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1825 filt_h1, filt_h2, filt_h3);
1826 dst6_r =
HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1827 filt_h1, filt_h2, filt_h3);
1829 filt_h0, filt_h1, filt_h2, filt_h3);
1831 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1832 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1835 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1836 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1837 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1838 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1841 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1842 dst += (8 * dst_stride);
1845 dst32_r = dst1110_r;
1846 dst54_r = dst1312_r;
1848 dst43_r = dst1211_r;
1849 dst65_r = dst1413_r;
1850 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1858 const int8_t *filter_x,
1859 const int8_t *filter_y,
1863 filter_x, filter_y,
height, 16);
1870 const int8_t *filter_x,
1871 const int8_t *filter_y,
1875 filter_x, filter_y,
height, 24);
1882 const int8_t *filter_x,
1883 const int8_t *filter_y,
1887 filter_x, filter_y,
height, 32);
1894 const int8_t *filter_x,
1895 const int8_t *filter_y,
1899 filter_x, filter_y,
height, 48);
1906 const int8_t *filter_x,
1907 const int8_t *filter_y,
1911 filter_x, filter_y,
height, 64);
1915 uint8_t *dst,
int32_t dst_stride,
1918 v16i8 filt0, filt1,
src0,
src1, mask0, mask1, vec0, vec1;
1935 res0 = __msa_srari_h(res0, 6);
1936 res0 = __msa_sat_s_h(res0, 7);
1942 uint8_t *dst,
int32_t dst_stride,
1945 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1946 v8i16
filt, out0, out1;
1961 filt0, filt1, out0, out1);
1965 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
1969 uint8_t *dst,
int32_t dst_stride,
1972 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1974 v8i16
filt, out0, out1, out2, out3;
1986 src += (4 * src_stride);
1990 filt0, filt1, out0, out1);
1994 filt0, filt1, out2, out3);
1998 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
2000 ST_W4(
out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2004 uint8_t *dst,
int32_t dst_stride,
2007 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2008 v16i8 filt0, filt1, mask0, mask1;
2010 v8i16
filt, out0, out1, out2, out3;
2022 src += (8 * src_stride);
2025 filt0, filt1, out0, out1);
2027 filt0, filt1, out2, out3);
2031 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
2033 ST_W4(
out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2034 dst += (8 * dst_stride);
2037 src += (8 * src_stride);
2040 filt0, filt1, out0, out1);
2042 filt0, filt1, out2, out3);
2046 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
2048 ST_W4(
out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2052 uint8_t *dst,
int32_t dst_stride,
2057 }
else if (4 ==
height) {
2059 }
else if (8 ==
height) {
2061 }
else if (16 ==
height) {
2067 uint8_t *dst,
int32_t dst_stride,
2070 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
2072 v8i16
filt, out0, out1, out2, out3;
2084 src += (4 * src_stride);
2088 filt1, out0, out1, out2, out3);
2093 ST_W2(out4, 0, 2, dst, dst_stride);
2094 ST_H2(out4, 2, 6, dst + 4, dst_stride);
2095 ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2096 ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2097 dst += (4 * dst_stride);
2100 src += (4 * src_stride);
2104 filt1, out0, out1, out2, out3);
2109 ST_W2(out4, 0, 2, dst, dst_stride);
2110 ST_H2(out4, 2, 6, dst + 4, dst_stride);
2111 ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2112 ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2116 uint8_t *dst,
int32_t dst_stride,
2120 v16i8
src0,
src1, filt0, filt1, mask0, mask1;
2122 v8i16
filt, vec0, vec1, vec2, vec3;
2132 for (loop_cnt = (
height >> 1); loop_cnt--;) {
2134 src += (2 * src_stride);
2138 DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2145 dst += (2 * dst_stride);
2150 uint8_t *dst,
int32_t dst_stride,
2154 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
2156 v8i16
filt, out0, out1, out2, out3;
2167 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2169 src += (4 * src_stride);
2173 filt1, out0, out1, out2, out3);
2178 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2179 dst += (4 * dst_stride);
2184 uint8_t *dst,
int32_t dst_stride,
2197 uint8_t *dst,
int32_t dst_stride,
2201 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2202 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2205 v8i16
filt, out0, out1, out2, out3, out4, out5;
2219 for (loop_cnt = 4; loop_cnt--;) {
2221 src += (4 * src_stride);
2225 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2231 ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2234 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2235 DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2236 out2, out3, out4, out5);
2238 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2239 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2240 out2, out3, out4, out5);
2245 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2246 dst += (4 * dst_stride);
2251 uint8_t *dst,
int32_t dst_stride,
2255 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2256 v16i8 filt0, filt1, mask0, mask1;
2257 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2258 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
2270 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2273 src += (4 * src_stride);
2278 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2279 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2280 out0, out1, out2, out3);
2282 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2283 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2284 out0, out1, out2, out3);
2294 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2295 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2296 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2297 out4, out5, out6, out7);
2298 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2299 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2300 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2301 out4, out5, out6, out7);
2314 uint8_t *dst,
int32_t dst_stride,
2317 uint8_t *dst1 = dst + 16;
2319 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2320 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2321 v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2322 v8i16
filt, out0, out1, out2, out3;
2334 mask11 = mask0 + 10;
2336 for (loop_cnt = 8; loop_cnt--;) {
2339 src += (4 * src_stride);
2343 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2345 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2346 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2347 out0, out1, out2, out3);
2348 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2349 out0, out1, out2, out3);
2359 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2360 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2361 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2362 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2363 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2364 out0, out1, out2, out3);
2365 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2366 out0, out1, out2, out3);
2378 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2380 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2382 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2383 out0, out1, out2, out3);
2384 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2385 out0, out1, out2, out3);
2391 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2392 dst1 += (4 * dst_stride);
2397 uint8_t *dst,
int32_t dst_stride,
2401 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2402 v16i8 filt0, filt1, mask0, mask1;
2404 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2405 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
2416 for (loop_cnt = (
height >> 1); loop_cnt--;) {
2431 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2432 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2433 out0, out1, out2, out3);
2435 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2436 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2437 out0, out1, out2, out3);
2439 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2440 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2441 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2442 out4, out5, out6, out7);
2443 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2444 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2445 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2446 out4, out5, out6, out7);
2465 uint8_t *dst,
int32_t dst_stride,
2468 v16i8
src0,
src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2469 v16i8 src2110, src4332, filt0, filt1;
2479 src += (3 * src_stride);
2482 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2483 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2485 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2486 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2487 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2489 out10 = __msa_srari_h(out10, 6);
2490 out10 = __msa_sat_s_h(out10, 7);
2496 uint8_t *dst,
int32_t dst_stride,
2500 v16i8
src0,
src1, src2, src3, src4, src5;
2501 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2502 v16i8 src2110, src4332, filt0, filt1;
2503 v8i16
filt, out10, out32;
2512 src += (3 * src_stride);
2516 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2517 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2519 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2520 LD_SB3(
src, src_stride, src3, src4, src5);
2521 src += (3 * src_stride);
2522 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2523 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2524 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2528 src += (src_stride);
2529 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2530 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2531 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2536 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
2537 dst += (4 * dst_stride);
2542 uint8_t *dst,
int32_t dst_stride,
2554 uint8_t *dst,
int32_t dst_stride,
2558 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2559 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2560 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2568 src += (3 * src_stride);
2573 src += (2 * src_stride);
2575 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2581 src += (2 * src_stride);
2583 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2589 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2592 ST_W2(out0, 0, 2, dst, dst_stride);
2593 ST_H2(out0, 2, 6, dst + 4, dst_stride);
2594 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2595 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2596 dst += (4 * dst_stride);
2599 src += (2 * src_stride);
2601 ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2607 src += (2 * src_stride);
2609 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2615 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2618 ST_W2(out0, 0, 2, dst, dst_stride);
2619 ST_H2(out0, 2, 6, dst + 4, dst_stride);
2620 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2621 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2625 uint8_t *dst,
int32_t dst_stride,
2628 v16i8
src0,
src1, src2, src3, src4;
2629 v8i16 src01, src12, src23, src34, tmp0, tmp1,
filt, filt0, filt1;
2651 uint8_t *dst,
int32_t dst_stride,
2655 uint64_t out0, out1, out2;
2656 v16i8
src0,
src1, src2, src3, src4, src5;
2657 v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2658 v8i16
filt, filt0, filt1;
2667 src += (3 * src_stride);
2672 for (loop_cnt = 2; loop_cnt--;) {
2673 LD_SB3(
src, src_stride, src3, src4, src5);
2674 src += (3 * src_stride);
2677 ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2682 tmp2 = __msa_srari_h(tmp2, 6);
2687 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2688 out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2689 out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2704 uint8_t *dst,
int32_t dst_stride,
2708 v16i8
src0,
src1, src2, src7, src8, src9, src10;
2709 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2711 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
2719 src += (3 * src_stride);
2724 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2725 LD_SB4(
src, src_stride, src7, src8, src9, src10);
2726 src += (4 * src_stride);
2729 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2730 src72_r, src87_r, src98_r, src109_r);
2736 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2739 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2740 dst += (4 * dst_stride);
2749 uint8_t *dst,
int32_t dst_stride,
2754 }
else if (6 ==
height) {
2763 uint8_t *dst,
int32_t dst_stride,
2767 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2769 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2770 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2771 v16i8 src2110, src4332, src6554;
2772 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2775 src -= (1 * src_stride);
2781 src += (3 * src_stride);
2786 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2788 for (loop_cnt = 4; loop_cnt--;) {
2789 LD_SB4(
src, src_stride, src3, src4, src5, src6);
2790 src += (4 * src_stride);
2793 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2794 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2795 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2796 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2797 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2798 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2809 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2813 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2815 ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
2816 dst += (4 * dst_stride);
2826 uint8_t *dst,
int32_t dst_stride,
2830 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2831 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2832 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2833 v16u8 tmp0, tmp1, tmp2, tmp3;
2834 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2842 src += (3 * src_stride);
2848 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2849 LD_SB4(
src, src_stride, src3, src4, src5, src6);
2850 src += (4 * src_stride);
2853 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2854 src32_r, src43_r, src54_r, src65_r);
2855 ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2856 src32_l, src43_l, src54_l, src65_l);
2867 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2868 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2869 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2870 out3_r, tmp0, tmp1, tmp2, tmp3);
2872 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2873 dst += (4 * dst_stride);
2884 uint8_t *dst,
int32_t dst_stride,
2888 uint64_t out0, out1;
2889 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2890 v16i8 src11, filt0, filt1;
2891 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2892 v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2894 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2908 LD_SB3(
src + 16, src_stride, src6, src7, src8);
2909 src += (3 * src_stride);
2911 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2913 for (loop_cnt = 8; loop_cnt--;) {
2917 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2918 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2921 LD_SB2(
src + 16, src_stride, src9, src10);
2922 src += (2 * src_stride);
2924 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2939 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2943 PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2945 out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2946 out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2957 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2958 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2961 LD_SB2(
src + 16, src_stride, src11, src8);
2962 src += (2 * src_stride);
2964 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2979 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2995 uint8_t *dst,
int32_t dst_stride,
2999 v16i8
src0,
src1, src2, src3, src4, src6, src7, src8, src9, src10;
3000 v16i8 src10_r, src32_r, src76_r, src98_r;
3001 v16i8 src21_r, src43_r, src87_r, src109_r;
3002 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3003 v16i8 src10_l, src32_l, src76_l, src98_l;
3004 v16i8 src21_l, src43_l, src87_l, src109_l;
3022 LD_SB3(
src + 16, src_stride, src6, src7, src8);
3023 src += (3 * src_stride);
3026 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3027 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3029 for (loop_cnt = (
height >> 1); loop_cnt--;) {
3033 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3034 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3044 SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3057 LD_SB2(
src + 16, src_stride, src9, src10);
3058 src += (2 * src_stride);
3060 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3061 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3071 SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3077 dst += 2 * dst_stride;
3091 const int8_t *filter_x,
3092 const int8_t *filter_y)
3095 v16i8
src0,
src1, src2, src3, src4;
3097 v8i16 filt_h0, filt_h1;
3100 v8i16 filter_vec,
tmp;
3101 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3102 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3105 src -= (src_stride + 1);
3107 filter_vec =
LD_SH(filter_x);
3110 filter_vec =
LD_SH(filter_y);
3122 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3135 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3136 tmp = __msa_srari_h(
tmp, 6);
3137 tmp = __msa_sat_s_h(
tmp, 7);
3146 const int8_t *filter_x,
3147 const int8_t *filter_y)
3150 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3152 v8i16 filt_h0, filt_h1;
3155 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3156 v8i16 filter_vec, tmp0, tmp1;
3157 v8i16 dst30, dst41, dst52, dst63;
3158 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3159 v4i32 dst0, dst1, dst2, dst3;
3161 src -= (src_stride + 1);
3163 filter_vec =
LD_SH(filter_x);
3166 filter_vec =
LD_SH(filter_y);
3178 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3179 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3193 SRA_4V(dst0, dst1, dst2, dst3, 6);
3198 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
3205 const int8_t *filter_x,
3206 const int8_t *filter_y,
3211 v16i8
src0,
src1, src2, src3, src4, src5;
3212 v16i8 src6, src7, src8, src9, src10;
3214 v8i16 filt_h0, filt_h1;
3217 v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3218 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3219 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3220 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3221 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3222 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3223 v8i16 dst98_r, dst109_r;
3225 src -= (src_stride + 1);
3227 filter_vec =
LD_SH(filter_x);
3230 filter_vec =
LD_SH(filter_y);
3238 src += (3 * src_stride);
3247 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3249 for (loop_cnt =
height >> 3; loop_cnt--;) {
3251 src3, src4, src5, src6, src7, src8, src9, src10);
3252 src += (8 * src_stride);
3256 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3257 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3258 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3259 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3266 dst32_r = __msa_ilvr_h(dst73, dst22);
3270 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3271 dst76_r = __msa_ilvr_h(dst22, dst106);
3281 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3282 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3284 dst5_r, dst4_r, dst7_r, dst6_r,
3285 tmp0, tmp1, tmp2, tmp3);
3290 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3291 dst += (8 * dst_stride);
3295 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3303 const int8_t *filter_x,
3304 const int8_t *filter_y,
3309 filter_x, filter_y);
3310 }
else if (4 ==
height) {
3312 filter_x, filter_y);
3313 }
else if (0 == (
height % 8)) {
3315 filter_x, filter_y,
height);
3323 const int8_t *filter_x,
3324 const int8_t *filter_y,
3327 v16u8 out0, out1, out2;
3328 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3329 v16i8 src7, src8, src9, src10;
3331 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3334 v8i16 filt_h0, filt_h1, filter_vec;
3335 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3336 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3337 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3338 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3339 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3340 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3341 v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3342 v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3343 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3345 src -= (src_stride + 1);
3347 filter_vec =
LD_SH(filter_x);
3350 filter_vec =
LD_SH(filter_y);
3358 src += (3 * src_stride);
3364 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3373 LD_SB8(
src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3376 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3377 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3378 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3379 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3386 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3387 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3388 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3389 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3405 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3406 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3407 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3420 dst3_l =
HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3421 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3422 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3423 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3424 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3425 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3426 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3434 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3435 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
3442 const int8_t *filter_x,
3443 const int8_t *filter_y)
3446 v16i8
src0,
src1, src2, src3, src4;
3448 v8i16 filt_h0, filt_h1, filter_vec;
3451 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3452 v8i16 dst0, dst1, dst2, dst3, dst4;
3453 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3454 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3455 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3456 v8i16 out0_r, out1_r;
3458 src -= (src_stride + 1);
3460 filter_vec =
LD_SH(filter_x);
3463 filter_vec =
LD_SH(filter_y);
3475 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3476 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3477 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3492 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3493 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3504 const int8_t *filter_x,
3505 const int8_t *filter_y,
3510 v16i8
src0,
src1, src2, src3, src4, src5, src6, mask0, mask1;
3511 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3512 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3513 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3514 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3515 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3516 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3518 src -= (src_stride + 1);
3520 filter_vec =
LD_SH(filter_x);
3523 filter_vec =
LD_SH(filter_y);
3531 for (cnt = width8mult; cnt--;) {
3538 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3547 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3548 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3549 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3550 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3571 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3572 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3574 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3575 dst3_r, tmp0, tmp1, tmp2, tmp3);
3580 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3589 const int8_t *filter_x,
3590 const int8_t *filter_y)
3592 v16u8 out0, out1, out2;
3593 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3595 v8i16 filt_h0, filt_h1, filter_vec;
3598 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3599 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3600 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3601 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3602 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3603 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3604 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3605 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3606 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3607 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3609 src -= (src_stride + 1);
3611 filter_vec =
LD_SH(filter_x);
3614 filter_vec =
LD_SH(filter_y);
3622 src += (5 * src_stride);
3623 LD_SB4(
src, src_stride, src5, src6, src7, src8);
3630 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3631 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3632 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3633 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3634 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3635 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3636 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3670 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3671 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3672 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3674 dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3675 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3678 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3684 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3685 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3692 const int8_t *filter_x,
3693 const int8_t *filter_y,
3697 uint32_t loop_cnt, cnt;
3701 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3703 v8i16 filt_h0, filt_h1, filter_vec;
3706 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3707 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3708 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3709 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3710 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3711 v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3712 v8i16 out0_r, out1_r, out2_r, out3_r;
3714 src -= (src_stride + 1);
3716 filter_vec =
LD_SH(filter_x);
3719 filter_vec =
LD_SH(filter_y);
3726 for (cnt = width8mult; cnt--;) {
3731 src_tmp += (3 * src_stride);
3737 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3746 for (loop_cnt = (
height >> 2); loop_cnt--;) {
3747 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3748 src_tmp += (4 * src_stride);
3752 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3753 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3754 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3755 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3776 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3777 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3780 dst2_l, dst2_r, dst3_l, dst3_r,
3781 out0_r, out1_r, out2_r, out3_r);
3784 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3787 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3788 dst_tmp += (4 * dst_stride);
3806 const int8_t *filter_x, /* forwarded unchanged to every branch below */
3807 const int8_t *filter_y, /* forwarded unchanged to every branch below */
3812 filter_x, filter_y); /* first height case: helper receives only the two filters last */
3813 }
else if (4 ==
height) { /* exactly four rows */
3815 filter_x, filter_y, 1); /* trailing 1: presumably a width-in-multiples-of-8 count -- confirm */
3816 }
else if (6 ==
height) { /* exactly six rows */
3818 filter_x, filter_y);
3819 }
else if (0 == (
height % 4)) { /* any other height that is a multiple of four */
3821 filter_x, filter_y,
height, 1); /* generic helper gets the row count plus a trailing 1 (presumably width8mult -- confirm) */
3829 const int8_t *filter_x, /* taps loaded into filter_vec by the first LD_SH below */
3830 const int8_t *filter_y, /* taps loaded into filter_vec by the second LD_SH below */
3834 uint8_t *src_tmp, *dst_tmp; /* working pointers used by the first (4-row-step) loop */
3836 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3837 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3838 v16i8 mask0, mask1, mask2, mask3; /* mask0/mask1 serve the first loop, mask2/mask3 the second */
3839 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3840 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3841 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3842 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3843 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3844 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3845 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3846 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* Rewind the source pointer by one full row plus one byte.
 * NOTE(review): presumably exposes the above/left neighbour needed by the
 * filter window -- confirm. */
3848 src -= (src_stride + 1);
/* filter_vec is reused: filter_x taps first, then overwritten by filter_y taps. */
3850 filter_vec =
LD_SH(filter_x);
3853 filter_vec =
LD_SH(filter_y);
/* Step the working source pointer down three rows.
 * NOTE(review): assumed to skip rows already loaded as src0..src2 -- confirm. */
3865 src_tmp += (3 * src_stride);
/* Byte-shuffle row src2 with mask0 and mask1 into vec4 and vec5. */
3871 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
/* First phase: a fixed 4 passes of 4 rows each (16 rows total) through
 * src_tmp/dst_tmp. */
3880 for (loop_cnt = 4; loop_cnt--;) {
3881 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3882 src_tmp += (4 * src_stride);
/* Each new row is shuffled against itself with mask0/mask1. */
3885 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3886 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3887 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3888 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
/* Scale the eight 32-bit accumulators by a shift of 6
 * (presumably arithmetic right shift, per the SRA name -- confirm). */
3909 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3910 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
/* Pack each left/right 32-bit pair into one 16-bit vector (tmp0..tmp3). */
3912 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3913 dst3_r, tmp0, tmp1, tmp2, tmp3);
/* Write four rows from elements 0 and 1 of out0 and out1, then advance. */
3918 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3919 dst_tmp += (4 * dst_stride);
/* Second phase works on src/dst directly; skip three rows first.
 * NOTE(review): presumably this phase covers the remaining narrow column --
 * the pointer offset that selects it is set elsewhere; confirm. */
3935 src += (3 * src_stride);
/* Seed dst22 with doubleword element 1 of dst21, replicated. */
3944 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
/* A fixed 2 passes of 8 rows each (16 rows total). */
3946 for (loop_cnt = 2; loop_cnt--;) {
3948 src3, src4, src5, src6, src7, src8, src9, src10);
3949 src += (8 * src_stride);
/* Rows are shuffled in pairs four apart (3&7, 4&8, 5&9, 6&10) with
 * mask2/mask3, so one vector carries data from two rows. */
3951 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
3952 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
3953 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
3954 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
/* Interleave halfwords of dst73 with the carried dst22. */
3961 dst32_r = __msa_ilvr_h(dst73, dst22);
/* Refresh dst22 from doubleword element 1 of dst73 before pairing with dst106. */
3965 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3966 dst76_r = __msa_ilvr_h(dst22, dst106);
/* Scale all eight 32-bit accumulators by a shift of 6. */
3976 SRA_4V(dst0, dst1, dst2, dst3, 6);
3977 SRA_4V(dst4, dst5, dst6, dst7, 6);
/* Pack accumulator pairs (odd-numbered one listed first) into tmp0..tmp3. */
3978 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3979 tmp0, tmp1, tmp2, tmp3);
/* Write eight rows: elements 0..3 of out0, then 0..3 of out1
 * (presumably one 32-bit word per row, per the W in the name -- confirm). */
3984 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3985 dst += (8 * dst_stride);
/* Carry doubleword element 1 of dst106 into dst22.
 * NOTE(review): presumably consumed by the ilvr_h with dst73 on the next
 * pass -- confirm this statement sits inside the loop. */
3989 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3997 const int8_t *filter_x, /* forwarded unchanged to the helper call below */
3998 const int8_t *filter_y, /* forwarded unchanged to the helper call below */
4006 filter_x, filter_y,
height, 2); /* trailing 2: presumably a count of 8-pixel columns (2 * 8 = 16) -- TODO confirm */
4014 const int8_t *filter_x, /* forwarded unchanged to the helper call below */
4015 const int8_t *filter_y, /* forwarded unchanged to the helper call below */
4019 filter_x, filter_y,
height, 3); /* trailing 3: presumably a count of 8-pixel columns (3 * 8 = 24) -- TODO confirm */
4026 const int8_t *filter_x, /* forwarded unchanged to the helper call below */
4027 const int8_t *filter_y, /* forwarded unchanged to the helper call below */
4031 filter_x, filter_y,
height, 4); /* trailing 4: presumably a count of 8-pixel columns (4 * 8 = 32) -- TODO confirm */
/*
 * UNI_MC_COPY(WIDTH): defines ff_hevc_put_hevc_uni_pel_pixels<WIDTH>_8_msa.
 * The generated function takes the destination first (dst, dst_stride, ...)
 * and forwards to copy_width<WIDTH>_msa, which takes the source first:
 * (src, src_stride, dst, dst_stride, height).
 */
4034 #define UNI_MC_COPY(WIDTH) \
4035 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4036 ptrdiff_t dst_stride, \
4038 ptrdiff_t src_stride, \
4044 copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
/*
 * UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR): defines
 * ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_msa.
 *   PEL      - selects the filter table ff_hevc_<PEL>_filters
 *   DIR      - direction letter pasted into the generated function name
 *   WIDTH    - block width pasted into both the wrapper and the kernel name
 *   TAP      - tap count pasted into the kernel name
 *   DIR1     - direction tag pasted into the kernel name
 *   FILT_DIR - expression indexing the filter table; the table row used is
 *              FILT_DIR - 1, so a value of 0 would index row -1
 *              (NOTE(review): callers presumably never pass 0 here -- confirm).
 * The generated function forwards to common_<DIR1>_<TAP>t_<WIDTH>w_msa with
 * the source arguments first.
 */
4057 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4058 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4059 ptrdiff_t dst_stride, \
4061 ptrdiff_t src_stride, \
4067 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4069 common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
/*
 * Entry-point instantiations, one UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
 * per block width. Each line defines
 * ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_msa, which picks row FILT_DIR - 1
 * of ff_hevc_<PEL>_filters and forwards to common_<DIR1>_<TAP>t_<WIDTH>w_msa.
 */

/* qpel, h: 8-tap "hz" kernels, filter row chosen by mx */
4073 UNI_MC(qpel, h,  4, 8, hz, mx);
4074 UNI_MC(qpel, h,  8, 8, hz, mx);
4075 UNI_MC(qpel, h, 12, 8, hz, mx);
4076 UNI_MC(qpel, h, 16, 8, hz, mx);
4077 UNI_MC(qpel, h, 24, 8, hz, mx);
4078 UNI_MC(qpel, h, 32, 8, hz, mx);
4079 UNI_MC(qpel, h, 48, 8, hz, mx);
4080 UNI_MC(qpel, h, 64, 8, hz, mx);

/* qpel, v: 8-tap "vt" kernels, filter row chosen by my */
4082 UNI_MC(qpel, v,  4, 8, vt, my);
4083 UNI_MC(qpel, v,  8, 8, vt, my);
4084 UNI_MC(qpel, v, 12, 8, vt, my);
4085 UNI_MC(qpel, v, 16, 8, vt, my);
4086 UNI_MC(qpel, v, 24, 8, vt, my);
4087 UNI_MC(qpel, v, 32, 8, vt, my);
4088 UNI_MC(qpel, v, 48, 8, vt, my);
4089 UNI_MC(qpel, v, 64, 8, vt, my);

/* epel, h: 4-tap "hz" kernels, filter row chosen by mx */
4091 UNI_MC(epel, h,  4, 4, hz, mx);
4092 UNI_MC(epel, h,  6, 4, hz, mx);
4093 UNI_MC(epel, h,  8, 4, hz, mx);
4094 UNI_MC(epel, h, 12, 4, hz, mx);
4095 UNI_MC(epel, h, 16, 4, hz, mx);
4096 UNI_MC(epel, h, 24, 4, hz, mx);
4097 UNI_MC(epel, h, 32, 4, hz, mx);

/* epel, v: 4-tap "vt" kernels, filter row chosen by my */
4099 UNI_MC(epel, v,  4, 4, vt, my);
4100 UNI_MC(epel, v,  6, 4, vt, my);
4101 UNI_MC(epel, v,  8, 4, vt, my);
4102 UNI_MC(epel, v, 12, 4, vt, my);
4103 UNI_MC(epel, v, 16, 4, vt, my);
4104 UNI_MC(epel, v, 24, 4, vt, my);
4105 UNI_MC(epel, v, 32, 4, vt, my);
4109 #define UNI_MC_HV(PEL, WIDTH, TAP) \
4110 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
4111 ptrdiff_t dst_stride, \
4113 ptrdiff_t src_stride, \
4119 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4120 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4122 hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4123 filter_x, filter_y, height); \