26 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
27 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
28 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
31 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
32 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
33 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
36 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \
39 v16i8 tmp0_m, tmp1_m; \
40 v16i8 minus5b_m = __msa_ldi_b(-5); \
41 v16i8 plus20b_m = __msa_ldi_b(20); \
43 ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \
44 HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \
45 ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \
46 DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \
47 ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \
48 DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \
51 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \
55 v16i8 minus5b = __msa_ldi_b(-5); \
56 v16i8 plus20b = __msa_ldi_b(20); \
58 tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0); \
59 out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \
61 tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0); \
62 out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \
64 tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0); \
65 out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m); \
70 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
74 out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
75 out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
76 out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
81 #define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2) \
85 out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0); \
86 out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1); \
87 out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2); \
88 out0_m = __msa_srari_w(out0_m, 10); \
89 out0_m = __msa_sat_s_w(out0_m, 7); \
96 const int16_t filt_const0 = 0xfb01;
97 const int16_t filt_const1 = 0x1414;
98 const int16_t filt_const2 = 0x1fb;
100 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
101 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
102 v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
103 v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
104 v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;
106 filt0 = (v16i8) __msa_fill_h(filt_const0);
107 filt1 = (v16i8) __msa_fill_h(filt_const1);
108 filt2 = (v16i8) __msa_fill_h(filt_const2);
112 LD_SB5(src_y,
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
115 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
116 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
117 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
118 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
122 LD_SB4(src_x,
stride, src_hz0, src_hz1, src_hz2, src_hz3);
130 LD_SB4(src_y,
stride, src_vt5, src_vt6, src_vt7, src_vt8);
132 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
133 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
134 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
135 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
138 ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
139 ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
140 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
142 vt_out1 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
147 out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
148 out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
158 const int16_t filt_const0 = 0xfb01;
159 const int16_t filt_const1 = 0x1414;
160 const int16_t filt_const2 = 0x1fb;
162 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
163 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
164 v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12;
165 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
166 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
167 v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
168 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
169 v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
171 filt0 = (v16i8) __msa_fill_h(filt_const0);
172 filt1 = (v16i8) __msa_fill_h(filt_const1);
173 filt2 = (v16i8) __msa_fill_h(filt_const2);
176 LD_SB5(src_y,
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
181 LD_SB4(src_x,
stride, src_hz0, src_hz1, src_hz2, src_hz3);
190 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
191 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
193 LD_SB4(src_y,
stride, src_vt5, src_vt6, src_vt7, src_vt8);
197 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
198 src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
199 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
200 src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
201 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
203 vt_out1 =
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
205 vt_out2 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
207 vt_out3 =
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
209 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
210 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
212 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
213 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
214 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
215 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
217 LD_SB4(src_x,
stride, src_hz0, src_hz1, src_hz2, src_hz3);
226 LD_SB4(src_y,
stride, src_vt9, src_vt10, src_vt11, src_vt12);
234 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
235 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
237 ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
238 src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
240 vt_out0 =
AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
242 vt_out1 =
AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
244 vt_out2 =
AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
246 vt_out3 =
AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
248 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
249 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
251 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
252 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
253 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
254 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
263 const uint8_t *src_y, uint8_t *dst,
266 const int16_t filt_const0 = 0xfb01;
267 const int16_t filt_const1 = 0x1414;
268 const int16_t filt_const2 = 0x1fb;
269 const uint8_t *src_x_tmp = src_x;
270 const uint8_t *src_y_tmp = src_y;
271 uint8_t *dst_tmp = dst;
272 uint32_t multiple8_cnt, loop_cnt;
274 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
275 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
276 v16i8 src_vt7, src_vt8;
277 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
278 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
279 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
280 v8i16 vt_out3, out0, out1, out2, out3;
282 filt0 = (v16i8) __msa_fill_h(filt_const0);
283 filt1 = (v16i8) __msa_fill_h(filt_const1);
284 filt2 = (v16i8) __msa_fill_h(filt_const2);
288 for (multiple8_cnt = 2; multiple8_cnt--;) {
293 LD_SB5(src_y,
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
298 for (loop_cnt = 4; loop_cnt--;) {
299 LD_SB4(src_x,
stride, src_hz0, src_hz1, src_hz2, src_hz3);
307 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
308 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
310 LD_SB4(src_y,
stride, src_vt5, src_vt6, src_vt7, src_vt8);
314 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
315 src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
317 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
318 src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
320 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
322 vt_out1 =
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
324 vt_out2 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
326 vt_out3 =
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
328 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
329 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
331 out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
332 out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
333 out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
334 out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
356 const uint8_t *src_y,
360 uint32_t tp0, tp1, tp2, tp3;
361 const int16_t filt_const0 = 0xfb01;
362 const int16_t filt_const1 = 0x1414;
363 const int16_t filt_const2 = 0x1fb;
364 v16u8 res, dst0 = { 0 };
365 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
366 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
367 v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
368 v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
369 v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1;
371 filt0 = (v16i8) __msa_fill_h(filt_const0);
372 filt1 = (v16i8) __msa_fill_h(filt_const1);
373 filt2 = (v16i8) __msa_fill_h(filt_const2);
377 LD_SB5(src_y,
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
380 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
381 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
382 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
383 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
387 LD_SB4(src_x,
stride, src_hz0, src_hz1, src_hz2, src_hz3);
395 LD_SB4(src_y,
stride, src_vt5, src_vt6, src_vt7, src_vt8);
397 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
398 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
399 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
400 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
403 ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
404 ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
405 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
407 vt_out1 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
414 res1 = __msa_srari_h((hz_out1 + vt_out1), 1);
415 res0 = __msa_srari_h((hz_out0 + vt_out0), 1);
419 dst0 = __msa_aver_u_b(res, dst0);
425 const uint8_t *src_y,
429 const int16_t filt_const0 = 0xfb01;
430 const int16_t filt_const1 = 0x1414;
431 const int16_t filt_const2 = 0x1fb;
432 uint64_t tp0, tp1, tp2, tp3;
433 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
434 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2;
435 v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
436 v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2;
437 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
438 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
439 v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
440 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
441 v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
443 filt0 = (v16i8) __msa_fill_h(filt_const0);
444 filt1 = (v16i8) __msa_fill_h(filt_const1);
445 filt2 = (v16i8) __msa_fill_h(filt_const2);
448 LD_SB5(src_y,
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
453 LD_SB4(src_x,
stride, src_hz0, src_hz1, src_hz2, src_hz3);
462 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
463 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
465 LD_SB4(src_y,
stride, src_vt5, src_vt6, src_vt7, src_vt8);
469 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
470 src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
471 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
472 src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
473 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
475 vt_out1 =
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
477 vt_out2 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
479 vt_out3 =
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
481 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
482 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
484 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
485 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
486 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
487 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
489 LD_SB4(src_x,
stride, src_hz0, src_hz1, src_hz2, src_hz3);
503 LD_SB4(src_y,
stride, src_vt9, src_vt10, src_vt11, src_vt12);
511 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
512 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
514 ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
515 src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
517 vt_out0 =
AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
519 vt_out1 =
AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
521 vt_out2 =
AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
523 vt_out3 =
AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
525 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
526 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
528 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
529 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
530 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
531 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
545 const uint8_t *src_y,
549 const int16_t filt_const0 = 0xfb01;
550 const int16_t filt_const1 = 0x1414;
551 const int16_t filt_const2 = 0x1fb;
552 const uint8_t *src_x_tmp = src_x;
553 const uint8_t *src_y_tmp = src_y;
554 uint8_t *dst_tmp = dst;
555 uint32_t multiple8_cnt, loop_cnt;
556 uint64_t tp0, tp1, tp2, tp3;
557 v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
558 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
559 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
560 v16i8 src_vt7, src_vt8;
561 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
562 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
563 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
564 v8i16 vt_out3, out0, out1, out2, out3;
566 filt0 = (v16i8) __msa_fill_h(filt_const0);
567 filt1 = (v16i8) __msa_fill_h(filt_const1);
568 filt2 = (v16i8) __msa_fill_h(filt_const2);
572 for (multiple8_cnt = 2; multiple8_cnt--;) {
577 LD_SB5(src_y,
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
582 for (loop_cnt = 4; loop_cnt--;) {
583 LD_SB4(src_x,
stride, src_hz0, src_hz1, src_hz2, src_hz3);
591 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
592 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
594 LD_SB4(src_y,
stride, src_vt5, src_vt6, src_vt7, src_vt8);
598 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
599 src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
601 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
602 src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
604 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
606 vt_out1 =
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
608 vt_out2 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
610 vt_out3 =
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
612 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
613 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
615 out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
616 out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
617 out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
618 out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
647 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
648 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
652 LD_UB8(
src,
stride, src8, src9, src10, src11, src12, src13, src14, src15);
656 ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst,
stride);
662 uint64_t
src0,
src1, src2, src3, src4, src5, src6, src7;
669 SD4(src4, src5, src6, src7, dst,
stride);
675 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
676 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
680 LD_UB8(dst,
stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
682 AVER_UB4_UB(
src0, dst0,
src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
684 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
686 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst,
stride);
690 LD_UB8(dst,
stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
692 AVER_UB4_UB(
src0, dst0,
src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
694 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
696 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst,
stride);
702 uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
703 v16u8
src0 = { 0 },
src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
704 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
721 AVER_UB4_UB(
src0, dst0,
src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
724 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
730 uint32_t tp0, tp1, tp2, tp3;
731 v16u8
src0 = { 0 }, dst0 = { 0 };
738 dst0 = __msa_aver_u_b(
src0, dst0);
747 v16i8 dst0, dst1, dst2, dst3,
src0,
src1, src2, src3, src4, src5, src6;
748 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
749 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
750 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
751 v16i8 minus5b = __msa_ldi_b(-5);
752 v16i8 plus20b = __msa_ldi_b(20);
760 for (loop_cnt = 4; loop_cnt--;) {
772 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
774 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
776 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
777 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
778 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
779 minus5b, res0, res1, res2, res3);
780 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
781 plus20b, res0, res1, res2, res3);
782 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
783 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
784 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
785 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
786 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
787 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
788 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
789 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
790 minus5b, res4, res5, res6, res7);
791 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
792 plus20b, res4, res5, res6, res7);
794 src0, src2, src4, src6);
801 dst0 = __msa_aver_s_b(dst0,
src0);
802 dst1 = __msa_aver_s_b(dst1, src2);
803 dst2 = __msa_aver_s_b(dst2, src4);
804 dst3 = __msa_aver_s_b(dst3, src6);
815 v16i8 dst0, dst1, dst2, dst3,
src0,
src1, src2, src3, src4, src5, src6;
816 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
817 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
818 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
819 v16i8 minus5b = __msa_ldi_b(-5);
820 v16i8 plus20b = __msa_ldi_b(20);
828 for (loop_cnt = 4; loop_cnt--;) {
840 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
842 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
844 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
845 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
846 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
847 minus5b, res0, res1, res2, res3);
848 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
849 plus20b, res0, res1, res2, res3);
850 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
851 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
852 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
853 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
854 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
855 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
856 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
857 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
858 minus5b, res4, res5, res6, res7);
859 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
860 plus20b, res4, res5, res6, res7);
862 src0, src2, src4, src6);
869 dst0 = __msa_aver_s_b(dst0,
src0);
870 dst1 = __msa_aver_s_b(dst1, src2);
871 dst2 = __msa_aver_s_b(dst2, src4);
872 dst3 = __msa_aver_s_b(dst3, src6);
882 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
883 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
884 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
885 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
886 v16i8 minus5b = __msa_ldi_b(-5);
887 v16i8 plus20b = __msa_ldi_b(20);
893 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
894 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
896 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
897 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
898 res0, res1, res2, res3);
900 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
901 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
902 res0, res1, res2, res3);
903 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
904 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
905 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
906 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
907 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
908 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
909 res4, res5, res6, res7);
910 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
911 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
912 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
913 res4, res5, res6, res7);
916 SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
917 src4, src5, src6, src7);
926 tmp0 = __msa_aver_s_b(tmp0,
src0);
927 tmp1 = __msa_aver_s_b(tmp1,
src1);
928 tmp2 = __msa_aver_s_b(tmp2, src4);
929 tmp3 = __msa_aver_s_b(tmp3, src5);
931 ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
937 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
938 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
939 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
940 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
941 v16i8 minus5b = __msa_ldi_b(-5);
942 v16i8 plus20b = __msa_ldi_b(20);
948 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
949 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
951 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
952 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
953 res0, res1, res2, res3);
955 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
956 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
957 res0, res1, res2, res3);
958 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
959 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
960 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
961 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
962 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
963 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
964 res4, res5, res6, res7);
965 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
966 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
967 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
968 res4, res5, res6, res7);
971 SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
972 src4, src5, src6, src7);
981 tmp0 = __msa_aver_s_b(tmp0,
src0);
982 tmp1 = __msa_aver_s_b(tmp1,
src1);
983 tmp2 = __msa_aver_s_b(tmp2, src4);
984 tmp3 = __msa_aver_s_b(tmp3, src5);
986 ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
992 v16i8
src0,
src1, src2, src3, res, mask0, mask1, mask2;
993 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
995 v16i8 minus5b = __msa_ldi_b(-5);
996 v16i8 plus20b = __msa_ldi_b(20);
1004 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1006 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1009 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
1012 src0 = (v16i8) __msa_insve_w((v4i32)
src0, 1, (v4i32)
src1);
1013 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1014 src0 = (v16i8) __msa_insve_d((v2i64)
src0, 1, (v2i64)
src1);
1015 res = __msa_aver_s_b(res,
src0);
1016 res = (v16i8) __msa_xori_b((v16u8) res, 128);
1023 v16i8
src0,
src1, src2, src3, res, mask0, mask1, mask2;
1024 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1026 v16i8 minus5b = __msa_ldi_b(-5);
1027 v16i8 plus20b = __msa_ldi_b(20);
1035 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1037 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1040 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
1043 src0 = (v16i8) __msa_insve_w((v4i32)
src0, 1, (v4i32)
src1);
1044 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1045 src0 = (v16i8) __msa_insve_d((v2i64)
src0, 1, (v2i64)
src1);
1046 res = __msa_aver_s_b(res,
src0);
1047 res = (v16i8) __msa_xori_b((v16u8) res, 128);
1055 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1056 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1058 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1059 v16i8 minus5b = __msa_ldi_b(-5);
1060 v16i8 plus20b = __msa_ldi_b(20);
1065 for (loop_cnt = 4; loop_cnt--;) {
1077 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1079 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1081 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1082 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1083 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1084 minus5b, res0, res1, res2, res3);
1085 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1086 plus20b, res0, res1, res2, res3);
1087 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
1088 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
1089 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
1090 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
1091 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
1092 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
1093 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1094 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1095 minus5b, res4, res5, res6, res7);
1096 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1097 plus20b, res4, res5, res6, res7);
1102 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
1113 v16u8 out0, out1, out2, out3;
1114 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1115 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1117 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1118 v16i8 minus5b = __msa_ldi_b(-5);
1119 v16i8 plus20b = __msa_ldi_b(20);
1125 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1126 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
1128 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1129 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1130 res0, res1, res2, res3);
1132 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1133 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1134 plus20b, res0, res1, res2, res3);
1135 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
1136 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
1137 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
1138 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
1139 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
1140 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1141 res4, res5, res6, res7);
1142 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
1143 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
1144 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1145 plus20b, res4, res5, res6, res7);
1154 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
1161 v16i8
src0,
src1, src2, src3, mask0, mask1, mask2;
1162 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1164 v16i8 minus5b = __msa_ldi_b(-5);
1165 v16i8 plus20b = __msa_ldi_b(20);
1173 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1175 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1186 int16_t filt_const0 = 0xfb01;
1187 int16_t filt_const1 = 0x1414;
1188 int16_t filt_const2 = 0x1fb;
1189 v16u8 res0, res1, res2, res3;
1190 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1191 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1192 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1193 v16i8 src65_l, src87_l, filt0, filt1, filt2;
1194 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1196 filt0 = (v16i8) __msa_fill_h(filt_const0);
1197 filt1 = (v16i8) __msa_fill_h(filt_const1);
1198 filt2 = (v16i8) __msa_fill_h(filt_const2);
1206 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
1208 ILVL_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_l, src21_l,
1211 for (loop_cnt = 4; loop_cnt--;) {
1216 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1217 src65_r, src76_r, src87_r);
1218 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1219 src65_l, src76_l, src87_l);
1220 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1221 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1222 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1223 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1224 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1225 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1226 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1227 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1229 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1231 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1232 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1233 out3_r, res0, res1, res2, res3);
1234 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
1235 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
1236 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
1237 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
1260 int16_t filt_const0 = 0xfb01;
1261 int16_t filt_const1 = 0x1414;
1262 int16_t filt_const2 = 0x1fb;
1263 v16u8 res0, res1, res2, res3;
1264 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1265 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1266 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1267 v16i8 src65_l, src87_l, filt0, filt1, filt2;
1268 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1270 filt0 = (v16i8) __msa_fill_h(filt_const0);
1271 filt1 = (v16i8) __msa_fill_h(filt_const1);
1272 filt2 = (v16i8) __msa_fill_h(filt_const2);
1280 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
1282 ILVL_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_l, src21_l,
1285 for (loop_cnt = 4; loop_cnt--;) {
1290 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1291 src65_r, src76_r, src87_r);
1292 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1293 src65_l, src76_l, src87_l);
1294 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1295 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1296 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1297 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1298 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1299 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1300 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1301 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1303 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1305 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1306 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1307 out3_r, res0, res1, res2, res3);
1308 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
1309 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
1310 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
1311 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
1332 const int16_t filt_const0 = 0xfb01;
1333 const int16_t filt_const1 = 0x1414;
1334 const int16_t filt_const2 = 0x1fb;
1335 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1336 v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1337 v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1338 v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
1339 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
1341 filt0 = (v16i8) __msa_fill_h(filt_const0);
1342 filt1 = (v16i8) __msa_fill_h(filt_const1);
1343 filt2 = (v16i8) __msa_fill_h(filt_const2);
1349 LD_SB8(
src,
stride, src5, src6, src7, src8, src9, src10, src11, src12);
1350 XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
1352 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
1354 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1356 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1357 src109_r, src1110_r, src1211_r);
1358 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1359 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1360 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1361 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1362 out4_r =
AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1363 out5_r =
AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1364 out6_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1365 out7_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
1370 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1371 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
1372 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1373 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
1374 out0 = __msa_aver_s_b(out0, tmp0);
1375 out1 = __msa_aver_s_b(out1, tmp1);
1376 out2 = __msa_aver_s_b(out2, tmp2);
1377 out3 = __msa_aver_s_b(out3, tmp3);
1379 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
1385 const int16_t filt_const0 = 0xfb01;
1386 const int16_t filt_const1 = 0x1414;
1387 const int16_t filt_const2 = 0x1fb;
1388 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1389 v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1390 v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1391 v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
1392 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
1394 filt0 = (v16i8) __msa_fill_h(filt_const0);
1395 filt1 = (v16i8) __msa_fill_h(filt_const1);
1396 filt2 = (v16i8) __msa_fill_h(filt_const2);
1402 LD_SB8(
src,
stride, src5, src6, src7, src8, src9, src10, src11, src12);
1404 XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
1405 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
1407 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1409 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1410 src109_r, src1110_r, src1211_r);
1411 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1412 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1413 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1414 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1415 out4_r =
AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1416 out5_r =
AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1417 out6_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1418 out7_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
1423 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1424 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
1425 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1426 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
1427 out0 = __msa_aver_s_b(out0, tmp0);
1428 out1 = __msa_aver_s_b(out1, tmp1);
1429 out2 = __msa_aver_s_b(out2, tmp2);
1430 out3 = __msa_aver_s_b(out3, tmp3);
1432 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
1438 int16_t filt_const0 = 0xfb01;
1439 int16_t filt_const1 = 0x1414;
1440 int16_t filt_const2 = 0x1fb;
1442 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1443 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1444 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
1447 filt0 = (v16i8) __msa_fill_h(filt_const0);
1448 filt1 = (v16i8) __msa_fill_h(filt_const1);
1449 filt2 = (v16i8) __msa_fill_h(filt_const2);
1455 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
1457 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1460 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1462 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1464 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1465 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1469 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1470 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
1471 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1472 out = __msa_aver_u_b(
out, (v16u8) src32_r);
1479 int16_t filt_const0 = 0xfb01;
1480 int16_t filt_const1 = 0x1414;
1481 int16_t filt_const2 = 0x1fb;
1483 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1484 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1485 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
1488 filt0 = (v16i8) __msa_fill_h(filt_const0);
1489 filt1 = (v16i8) __msa_fill_h(filt_const1);
1490 filt2 = (v16i8) __msa_fill_h(filt_const2);
1496 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
1498 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1501 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1503 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1505 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1506 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1510 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
1511 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
1512 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1513 out = __msa_aver_u_b(
out, (v16u8) src32_r);
1597 uint8_t *dst_tmp = dst;
1598 const uint8_t *src_tmp =
src - (2 *
stride) - 2;
1599 uint32_t multiple8_cnt, loop_cnt;
1600 const int32_t filt_const0 = 0xfffb0001;
1601 const int32_t filt_const1 = 0x140014;
1602 const int32_t filt_const2 = 0x1fffb;
1604 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1606 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1607 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1608 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1609 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1610 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1611 v8i16 hz_out87_l, filt0, filt1, filt2;
1614 filt0 = (v8i16) __msa_fill_w(filt_const0);
1615 filt1 = (v8i16) __msa_fill_w(filt_const1);
1616 filt2 = (v8i16) __msa_fill_w(filt_const2);
1620 for (multiple8_cnt = 2; multiple8_cnt--;) {
1634 for (loop_cnt = 4; loop_cnt--;) {
1645 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1646 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1648 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1649 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1651 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1652 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1654 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1655 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
1662 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1667 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1672 dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1677 dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1679 dst1 = __msa_srari_h(hz_out2, 5);
1680 dst3 = __msa_srari_h(hz_out3, 5);
1681 dst5 = __msa_srari_h(hz_out4, 5);
1682 dst7 = __msa_srari_h(hz_out5, 5);
1685 dst0 = __msa_aver_s_h(dst0, dst1);
1686 dst1 = __msa_aver_s_h(dst2, dst3);
1687 dst2 = __msa_aver_s_h(dst4, dst5);
1688 dst3 = __msa_aver_s_h(dst6, dst7);
1710 uint8_t *dst_tmp = dst;
1711 const uint8_t *src_tmp =
src - (2 *
stride) - 2;
1712 uint32_t multiple8_cnt, loop_cnt;
1713 const int32_t filt_const0 = 0xfffb0001;
1714 const int32_t filt_const1 = 0x140014;
1715 const int32_t filt_const2 = 0x1fffb;
1717 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1719 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1720 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1721 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1722 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1723 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1724 v8i16 hz_out87_l, filt0, filt1, filt2;
1727 filt0 = (v8i16) __msa_fill_w(filt_const0);
1728 filt1 = (v8i16) __msa_fill_w(filt_const1);
1729 filt2 = (v8i16) __msa_fill_w(filt_const2);
1733 for (multiple8_cnt = 2; multiple8_cnt--;) {
1747 for (loop_cnt = 4; loop_cnt--;) {
1758 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1759 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1761 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1762 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1764 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1765 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1767 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1768 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
1775 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1780 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1785 dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1790 dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1792 dst1 = __msa_srari_h(hz_out3, 5);
1793 dst3 = __msa_srari_h(hz_out4, 5);
1794 dst5 = __msa_srari_h(hz_out5, 5);
1795 dst7 = __msa_srari_h(hz_out6, 5);
1798 dst0 = __msa_aver_s_h(dst0, dst1);
1799 dst1 = __msa_aver_s_h(dst2, dst3);
1800 dst2 = __msa_aver_s_h(dst4, dst5);
1801 dst3 = __msa_aver_s_h(dst6, dst7);
1823 const int32_t filt_const0 = 0xfffb0001;
1824 const int32_t filt_const1 = 0x140014;
1825 const int32_t filt_const2 = 0x1fffb;
1827 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1828 v16i8 src11, src12, mask0, mask1, mask2;
1829 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1830 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1831 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1832 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1833 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1834 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1835 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1836 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
1841 filt0 = (v8i16) __msa_fill_w(filt_const0);
1842 filt1 = (v8i16) __msa_fill_w(filt_const1);
1843 filt2 = (v8i16) __msa_fill_w(filt_const2);
1866 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1867 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
1868 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1869 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
1870 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1871 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
1872 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1873 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
1875 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
1877 tmp1 =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
1879 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1880 tmp0 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
1882 tmp1 =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
1884 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1885 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
1887 tmp1 =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
1889 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1890 tmp0 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
1892 tmp1 =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
1894 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1896 SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
1897 SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
1899 dst0 = __msa_aver_s_h(dst0, hz_out2);
1900 dst1 = __msa_aver_s_h(dst1, hz_out3);
1901 dst2 = __msa_aver_s_h(dst2, hz_out4);
1902 dst3 = __msa_aver_s_h(dst3, hz_out5);
1915 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1916 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
1918 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1919 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
1921 tmp0 =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
1923 tmp1 =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
1925 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1926 tmp0 =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
1928 tmp1 =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
1930 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1931 tmp0 =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
1933 tmp1 =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
1935 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1936 tmp0 =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
1938 tmp1 =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
1940 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1942 SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
1943 SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
1945 dst0 = __msa_aver_s_h(dst0, hz_out6);
1946 dst1 = __msa_aver_s_h(dst1, hz_out7);
1947 dst2 = __msa_aver_s_h(dst2, hz_out8);
1948 dst3 = __msa_aver_s_h(dst3, hz_out9);
1958 const int32_t filt_const0 = 0xfffb0001;
1959 const int32_t filt_const1 = 0x140014;
1960 const int32_t filt_const2 = 0x1fffb;
1962 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1963 v16i8 src11, src12, mask0, mask1, mask2;
1964 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1965 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1966 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1967 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1968 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1969 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1970 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1971 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
1976 filt0 = (v8i16) __msa_fill_w(filt_const0);
1977 filt1 = (v8i16) __msa_fill_w(filt_const1);
1978 filt2 = (v8i16) __msa_fill_w(filt_const2);
2001 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2002 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2003 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2004 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
2005 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2006 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2007 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2008 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
2010 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2012 tmp1 =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
2014 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2015 tmp0 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2017 tmp1 =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
2019 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2020 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2022 tmp1 =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
2024 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2025 tmp0 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2027 tmp1 =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
2029 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2031 SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
2032 SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
2034 dst0 = __msa_aver_s_h(dst0, hz_out3);
2035 dst1 = __msa_aver_s_h(dst1, hz_out4);
2036 dst2 = __msa_aver_s_h(dst2, hz_out5);
2037 dst3 = __msa_aver_s_h(dst3, hz_out6);
2050 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2051 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
2053 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2054 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
2056 tmp0 =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
2058 tmp1 =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
2060 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2061 tmp0 =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
2063 tmp1 =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
2065 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2066 tmp0 =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
2068 tmp1 =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
2070 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2071 tmp0 =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
2073 tmp1 =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
2075 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2077 SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
2078 SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
2080 dst0 = __msa_aver_s_h(dst0, hz_out7);
2081 dst1 = __msa_aver_s_h(dst1, hz_out8);
2082 dst2 = __msa_aver_s_h(dst2, hz_out9);
2083 dst3 = __msa_aver_s_h(dst3, hz_out10);
2093 const int32_t filt_const0 = 0xfffb0001;
2094 const int32_t filt_const1 = 0x140014;
2095 const int32_t filt_const2 = 0x1fffb;
2097 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2098 v16i8 mask0, mask1, mask2;
2099 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2100 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2101 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2102 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
2107 filt0 = (v8i16) __msa_fill_w(filt_const0);
2108 filt1 = (v8i16) __msa_fill_w(filt_const1);
2109 filt2 = (v8i16) __msa_fill_w(filt_const2);
2125 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2126 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
2128 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2129 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2130 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2131 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2133 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2135 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2137 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2138 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2140 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2142 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2147 dst0 = __msa_aver_s_h(dst0, hz_out2);
2148 dst1 = __msa_aver_s_h(dst1, hz_out4);
2157 const int32_t filt_const0 = 0xfffb0001;
2158 const int32_t filt_const1 = 0x140014;
2159 const int32_t filt_const2 = 0x1fffb;
2161 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2162 v16i8 mask0, mask1, mask2;
2163 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2164 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2165 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2166 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
2171 filt0 = (v8i16) __msa_fill_w(filt_const0);
2172 filt1 = (v8i16) __msa_fill_w(filt_const1);
2173 filt2 = (v8i16) __msa_fill_w(filt_const2);
2189 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2190 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
2192 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2193 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2194 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2195 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2197 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2199 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2201 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2202 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2204 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2206 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2208 PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
2212 dst0 = __msa_aver_s_h(dst0, hz_out0);
2213 dst1 = __msa_aver_s_h(dst1, hz_out1);
2223 int16_t filt_const0 = 0xfb01;
2224 int16_t filt_const1 = 0x1414;
2225 int16_t filt_const2 = 0x1fb;
2226 v16u8 res0, res1, res2, res3;
2227 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2228 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2229 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2230 v16i8 src65_l, src87_l, filt0, filt1, filt2;
2231 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2233 filt0 = (v16i8) __msa_fill_h(filt_const0);
2234 filt1 = (v16i8) __msa_fill_h(filt_const1);
2235 filt2 = (v16i8) __msa_fill_h(filt_const2);
2242 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
2244 ILVL_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_l, src21_l,
2247 for (loop_cnt = 4; loop_cnt--;) {
2252 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
2253 src65_r, src76_r, src87_r);
2254 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
2255 src65_l, src76_l, src87_l);
2256 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2257 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2258 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2259 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2260 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2261 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2262 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2263 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2265 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2267 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2268 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2269 out3_r, res0, res1, res2, res3);
2289 const int16_t filt_const0 = 0xfb01;
2290 const int16_t filt_const1 = 0x1414;
2291 const int16_t filt_const2 = 0x1fb;
2292 v16u8 out0, out1, out2, out3;
2293 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2294 v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
2295 v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
2296 v16i8 filt0, filt1, filt2;
2297 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
2299 filt0 = (v16i8) __msa_fill_h(filt_const0);
2300 filt1 = (v16i8) __msa_fill_h(filt_const1);
2301 filt2 = (v16i8) __msa_fill_h(filt_const2);
2308 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
2310 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
2312 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
2313 src910_r, src1110_r, src1211_r);
2317 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
2318 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
2319 out2_r =
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
2320 out3_r =
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
2321 out4_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
2322 out5_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
2323 out6_r =
AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
2324 out7_r =
AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
2327 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2328 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
2333 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
2339 const int16_t filt_const0 = 0xfb01;
2340 const int16_t filt_const1 = 0x1414;
2341 const int16_t filt_const2 = 0x1fb;
2343 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2344 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2345 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
2348 filt0 = (v16i8) __msa_fill_h(filt_const0);
2349 filt1 = (v16i8) __msa_fill_h(filt_const1);
2350 filt2 = (v16i8) __msa_fill_h(filt_const2);
2358 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
2360 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2362 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
2363 src76_r, src2110, src4332, src6554, src8776);
2365 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
2366 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
2378 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2380 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2381 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2382 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2383 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2384 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2385 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2386 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2387 v8i16 minus5h = __msa_ldi_h(-5);
2388 v8i16 plus20h = __msa_ldi_h(20);
2402 for (row = 16; row--;) {
2411 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2412 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2413 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2414 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2415 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2416 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2417 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2418 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
2419 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2420 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2421 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2422 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2423 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2424 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2425 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2426 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
2427 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2428 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2429 dst0 = __msa_srari_h(shf_vec2, 5);
2430 dst1 = __msa_srari_h(shf_vec5, 5);
2431 dst2 = __msa_srari_h(shf_vec8, 5);
2432 dst3 = __msa_srari_h(shf_vec11, 5);
2435 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2436 dst0 = __msa_aver_s_h(dst2, dst0);
2437 dst1 = __msa_aver_s_h(dst3, dst1);
/* Working vectors: source rows, vertical-filter results (vt_res*), shuffled
 * tap pairs (shf_vec*), 32-bit horizontal accumulators (hz_res*) and packed
 * outputs (dst*). */
2460 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2462 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2463 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2464 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2465 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle indices for a symmetric 6-element window:
 * mask0 pairs element i with i+5, mask1 pairs i+1 with i+4,
 * mask2 pairs i+2 with i+3 (for i = 0..3). */
2466 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2467 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2468 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* Weights applied to the (i+1, i+4) and (i+2, i+3) pairs respectively. */
2469 v8i16 minus5h = __msa_ldi_h(-5);
2470 v8i16 plus20h = __msa_ldi_h(20);
/* Sixteen iterations, counting down. */
2484 for (row = 16; row--;) {
/* Gather the three tap pairs from each vertical-result pair, first with
 * mask0..2 and then with mask3..5.
 * NOTE(review): mask3..mask5 must be assigned before this loop - confirm. */
2493 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2494 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2495 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2496 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2497 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2498 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2499 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2500 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Outer taps (weight 1): sum the two halfwords of each pair into a word. */
2501 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2502 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2503 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2504 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
/* Inner taps: presumably accumulates pair * -5 and pair * 20 into the same
 * accumulator (DPADD_SH2_SW is a project macro defined elsewhere). */
2505 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2506 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2507 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2508 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Presumably a rounding right shift by 10 followed by signed saturation with
 * bit position 7 (i.e. the signed 8-bit range) on all four accumulators. */
2509 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2510 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Rounding right shift by 5 of the centre (i+2, i+3) pairs of the
 * vertical-only result. */
2511 dst0 = __msa_srari_h(shf_vec2, 5);
2512 dst1 = __msa_srari_h(shf_vec5, 5);
2513 dst2 = __msa_srari_h(shf_vec8, 5);
2514 dst3 = __msa_srari_h(shf_vec11, 5);
/* Keep the odd-indexed halfwords, i.e. the second (i+3) element of each
 * centre pair. */
2516 dst0 = __msa_pckod_h(dst2, dst0);
2517 dst1 = __msa_pckod_h(dst3, dst1);
/* Pack the horizontal results to halfwords (presumably the even halfwords,
 * i.e. the low 16 bits of every 32-bit lane), then take the signed rounding
 * average with the vertical-only values. */
2518 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2519 dst0 = __msa_aver_s_h(dst2, dst0);
2520 dst1 = __msa_aver_s_h(dst3, dst1);
/* Working vectors: source rows, vertical-filter results (vt_res*), shuffled
 * tap pairs (shf_vec*), 32-bit horizontal accumulators (hz_res*) and packed
 * outputs (dst*). */
2543 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2544 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2545 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2546 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2547 v8i16 mask3, mask4, mask5;
2548 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle indices for a symmetric 6-element window:
 * (i, i+5), (i+1, i+4) and (i+2, i+3) pairs for i = 0..3. */
2549 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2550 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2551 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* Weights for the (i+1, i+4) and (i+2, i+3) pairs. */
2552 v8i16 minus5h = __msa_ldi_h(-5);
2553 v8i16 plus20h = __msa_ldi_h(20);
/* Four iterations, counting down. */
2565 for (row = 4; row--;) {
/* Gather the tap pairs from each vertical-result pair with mask0..2 and
 * mask3..5.  NOTE(review): mask3..mask5 must be assigned before this loop -
 * confirm. */
2574 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2575 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2576 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2577 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2578 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2579 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2580 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2581 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Outer taps (weight 1): add both halfwords of each pair into a word. */
2582 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2583 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2584 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2585 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
/* Inner taps: presumably accumulates the -5 and +20 weighted pairs
 * (project macro, defined elsewhere). */
2586 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2587 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2588 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2589 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Presumably rounding shift right by 10, then signed saturation with bit
 * position 7. */
2590 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2591 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Rounding right shift by 5 of the centre (i+2, i+3) pairs of the
 * vertical-only result. */
2592 dst0 = __msa_srari_h(shf_vec2, 5);
2593 dst1 = __msa_srari_h(shf_vec5, 5);
2594 dst2 = __msa_srari_h(shf_vec8, 5);
2595 dst3 = __msa_srari_h(shf_vec11, 5);
/* Pack the horizontal results to halfwords (macro defined elsewhere) and
 * take the signed rounding average with dst0/dst1. */
2598 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2599 dst0 = __msa_aver_s_h(dst2, dst0);
2600 dst1 = __msa_aver_s_h(dst3, dst1);
/* Working vectors: source rows, vertical-filter results (vt_res*), shuffled
 * tap pairs (shf_vec*), 32-bit horizontal accumulators (hz_res*) and packed
 * outputs (dst*). */
2618 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2619 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2620 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2621 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2622 v8i16 mask3, mask4, mask5;
2623 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle indices for a symmetric 6-element window:
 * (i, i+5), (i+1, i+4) and (i+2, i+3) pairs for i = 0..3. */
2624 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2625 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2626 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* Weights for the (i+1, i+4) and (i+2, i+3) pairs. */
2627 v8i16 minus5h = __msa_ldi_h(-5);
2628 v8i16 plus20h = __msa_ldi_h(20);
/* Four iterations, counting down. */
2640 for (row = 4; row--;) {
/* Gather the tap pairs from each vertical-result pair with mask0..2 and
 * mask3..5.  NOTE(review): mask3..mask5 must be assigned before this loop -
 * confirm. */
2649 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2650 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2651 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2652 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2653 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2654 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2655 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2656 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Outer taps (weight 1): add both halfwords of each pair into a word. */
2657 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2658 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2659 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2660 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
/* Inner taps: presumably accumulates the -5 and +20 weighted pairs
 * (project macro, defined elsewhere). */
2661 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2662 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2663 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2664 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Presumably rounding shift right by 10, then signed saturation with bit
 * position 7. */
2665 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2666 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Rounding right shift by 5 of the centre (i+2, i+3) pairs of the
 * vertical-only result. */
2667 dst0 = __msa_srari_h(shf_vec2, 5);
2668 dst1 = __msa_srari_h(shf_vec5, 5);
2669 dst2 = __msa_srari_h(shf_vec8, 5);
2670 dst3 = __msa_srari_h(shf_vec11, 5);
/* Keep the odd-indexed halfwords: the second (i+3) element of each pair. */
2672 dst0 = __msa_pckod_h(dst2, dst0);
2673 dst1 = __msa_pckod_h(dst3, dst1);
/* Pack the horizontal results to halfwords (macro defined elsewhere) and
 * take the signed rounding average with the vertical-only values. */
2674 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2675 dst0 = __msa_aver_s_h(dst2, dst0);
2676 dst1 = __msa_aver_s_h(dst3, dst1);
/* Vertical filter coefficients packed as two signed bytes per halfword
 * (low byte, high byte): 0xfb01 -> (1, -5), 0x1414 -> (20, 20),
 * 0x1fb -> (-5, 1).
 * NOTE(review): 0xfb01 does not fit in int16_t; the initialiser relies on
 * implementation-defined narrowing (yields -1279 on two's-complement). */
2692 const int16_t filt_const0 = 0xfb01;
2693 const int16_t filt_const1 = 0x1414;
2694 const int16_t filt_const2 = 0x1fb;
2696 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2697 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2698 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2699 v16i8 src76_l, src87_l, filt0, filt1, filt2;
2700 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2701 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2702 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle indices for the horizontal window:
 * (i, i+5), (i+1, i+4) and (i+2, i+3) pairs for i = 0..3. */
2703 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2704 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2705 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2706 v8i16 minus5h = __msa_ldi_h(-5);
2707 v8i16 plus20h = __msa_ldi_h(20);
2708 v8i16 zeros = { 0 };
/* Replicate each packed coefficient pair into every halfword lane. */
2710 filt0 = (v16i8) __msa_fill_h(filt_const0);
2711 filt1 = (v16i8) __msa_fill_h(filt_const1);
2712 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Byte-interleave each row with the row above it; the _r/_l outputs are
 * presumably the right and left halves (macros defined elsewhere). */
2722 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
2724 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2726 ILVL_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_l, src21_l,
2728 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/* Vertical pass for the first two output rows: three byte dot products
 * (taps 1,-5 / 20,20 / -5,1) over six consecutive source rows. */
2730 vt_res0 =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2731 vt_res1 =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2732 vt_res2 =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2733 vt_res3 =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
/* Horizontal pass on the vertical results: gather tap pairs, sum the outer
 * pair, then accumulate the -5 and +20 weighted pairs (DPADD_SH2_SW is a
 * project macro - semantics presumed). */
2734 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2735 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2736 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2737 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2738 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2739 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2740 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2741 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Same two passes for the next two output rows (source rows shifted by 2);
 * the centre pairs go to shf_vec6/shf_vec7 so shf_vec2/shf_vec5 survive. */
2743 vt_res0 =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2744 vt_res1 =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2745 vt_res2 =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2746 vt_res3 =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2747 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2748 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2749 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2750 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2751 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2752 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2753 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2754 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* Rounding right shift by 5 of the centre (i+2, i+3) pairs of the
 * vertical-only result. */
2761 dst0 = __msa_srari_h(shf_vec2, 5);
2762 dst1 = __msa_srari_h(shf_vec5, 5);
2763 dst2 = __msa_srari_h(shf_vec6, 5);
2764 dst3 = __msa_srari_h(shf_vec7, 5);
/* Presumably interleaves the even halfwords (first element of each pair)
 * with zeros so they line up with the 32-bit lanes of hz_res* - TODO confirm
 * against the ILVEV_H2_SH definition. */
2768 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
2769 ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
/* Signed rounding average of the 2-D result with the vertical-only value. */
2771 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2772 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2773 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2774 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* Narrow the four word vectors into two halfword vectors. */
2776 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
/* Vertical filter coefficients packed as two signed bytes per halfword
 * (low byte, high byte): 0xfb01 -> (1, -5), 0x1414 -> (20, 20),
 * 0x1fb -> (-5, 1).
 * NOTE(review): 0xfb01 does not fit in int16_t; the initialiser relies on
 * implementation-defined narrowing (yields -1279 on two's-complement). */
2784 const int16_t filt_const0 = 0xfb01;
2785 const int16_t filt_const1 = 0x1414;
2786 const int16_t filt_const2 = 0x1fb;
2788 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2789 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2790 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2791 v16i8 src76_l, src87_l, filt0, filt1, filt2;
2792 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2793 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2794 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle indices for the horizontal window:
 * (i, i+5), (i+1, i+4) and (i+2, i+3) pairs for i = 0..3. */
2795 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2796 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2797 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2798 v8i16 minus5h = __msa_ldi_h(-5);
2799 v8i16 plus20h = __msa_ldi_h(20);
2800 v8i16 zeros = { 0 };
/* Replicate each packed coefficient pair into every halfword lane. */
2802 filt0 = (v16i8) __msa_fill_h(filt_const0);
2803 filt1 = (v16i8) __msa_fill_h(filt_const1);
2804 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Byte-interleave each row with the row above it; the _r/_l outputs are
 * presumably the right and left halves (macros defined elsewhere). */
2814 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
2816 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2818 ILVL_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_l, src21_l,
2820 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/* Vertical pass for the first two output rows: three byte dot products
 * (taps 1,-5 / 20,20 / -5,1) over six consecutive source rows. */
2823 vt_res0 =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2824 vt_res1 =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2825 vt_res2 =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2826 vt_res3 =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
/* Horizontal pass on the vertical results: gather tap pairs, sum the outer
 * pair, then accumulate the -5 and +20 weighted pairs (DPADD_SH2_SW is a
 * project macro - semantics presumed). */
2827 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2828 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2829 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2830 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2831 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2832 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2833 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2834 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Same two passes for the next two output rows (source rows shifted by 2);
 * the centre pairs go to shf_vec6/shf_vec7 so shf_vec2/shf_vec5 survive. */
2836 vt_res0 =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2837 vt_res1 =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2838 vt_res2 =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2839 vt_res3 =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2840 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2841 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2842 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2843 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2844 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2845 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2846 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2847 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* Rounding right shift by 5 of the centre (i+2, i+3) pairs of the
 * vertical-only result. */
2854 dst0 = __msa_srari_h(shf_vec2, 5);
2855 dst1 = __msa_srari_h(shf_vec5, 5);
2856 dst2 = __msa_srari_h(shf_vec6, 5);
2857 dst3 = __msa_srari_h(shf_vec7, 5);
/* Move each odd-indexed halfword (second element of every pair) into the
 * low half of a 32-bit lane, with zeros in the upper half. */
2862 dst0 = __msa_ilvod_h(zeros, dst0);
2863 dst1 = __msa_ilvod_h(zeros, dst1);
2864 dst2 = __msa_ilvod_h(zeros, dst2);
2865 dst3 = __msa_ilvod_h(zeros, dst3);
/* Signed rounding average of the 2-D result with the vertical-only value. */
2867 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2868 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2869 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2870 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* Narrow the four word vectors into two halfword vectors. */
2872 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
/* Filter coefficients packed as two signed halfwords per word
 * (low half, high half): 0xfffb0001 -> (1, -5), 0x140014 -> (20, 20),
 * 0x1fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 does not fit in int32_t; the initialiser relies
 * on implementation-defined narrowing. */
2880 const int32_t filt_const0 = 0xfffb0001;
2881 const int32_t filt_const1 = 0x140014;
2882 const int32_t filt_const2 = 0x1fffb;
/* Reading starts two rows above and two bytes to the left of src. */
2883 const uint8_t *src_tmp =
src - (2 *
stride) - 2;
2884 uint8_t *dst_tmp = dst;
2885 uint32_t multiple8_cnt, loop_cnt;
2887 v16i8
src0,
src1, src2, src3, src4, mask0, mask1, mask2;
2888 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2889 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
2890 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2891 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
2892 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
2893 v8i16 hz_out87_l, filt0, filt1, filt2;
/* Replicate each packed coefficient pair into every word lane. */
2896 filt0 = (v8i16) __msa_fill_w(filt_const0);
2897 filt1 = (v8i16) __msa_fill_w(filt_const1);
2898 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Outer loop runs twice (presumably one pass per 8-column half - TODO
 * confirm); inner loop runs four times. */
2902 for (multiple8_cnt = 2; multiple8_cnt--;) {
2916 for (loop_cnt = 4; loop_cnt--;) {
/* Halfword-interleave each horizontal result row with the row above it;
 * _r/_l are presumably the right and left halves (macros defined elsewhere). */
2926 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2927 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
2929 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2930 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
2932 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2933 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
2935 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2936 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* Narrow each pair of word results (tmp0 low part, tmp1 high part) to one
 * halfword vector by keeping the even-indexed halfwords. */
2943 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2948 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2953 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2958 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Filter coefficients packed as two signed halfwords per word
 * (low half, high half): 0xfffb0001 -> (1, -5), 0x140014 -> (20, 20),
 * 0x1fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 does not fit in int32_t; the initialiser relies
 * on implementation-defined narrowing. */
2980 const int32_t filt_const0 = 0xfffb0001;
2981 const int32_t filt_const1 = 0x140014;
2982 const int32_t filt_const2 = 0x1fffb;
2984 v16i8
src0,
src1, src2, src3, src4, mask0, mask1, mask2;
2985 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2986 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
2987 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2988 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
2989 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
2990 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
2991 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
2992 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
/* Replicate each packed coefficient pair into every word lane. */
2995 filt0 = (v8i16) __msa_fill_w(filt_const0);
2996 filt1 = (v8i16) __msa_fill_w(filt_const1);
2997 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Halfword-interleave horizontal result rows 0..8 with their predecessors;
 * _r/_l are presumably the right and left halves (macros defined elsewhere). */
3019 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3020 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3021 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3022 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
3023 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3024 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3025 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3026 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* Vertical pass for four output rows.  AVC_DOT_SW3_SW forms the three
 * halfword dot products, rounds right by 10 and saturates (bit position 7);
 * the right/left word results are then narrowed into one halfword vector. */
3028 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3030 tmp1 =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
3032 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3033 tmp0 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3035 tmp1 =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
3037 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3038 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3040 tmp1 =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
3042 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3043 tmp0 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3045 tmp1 =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
3047 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Interleave the next four horizontal result rows (9..12) and repeat the
 * vertical pass for four more output rows, reusing the row pairs above. */
3059 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3060 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
3062 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3063 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
3065 tmp0 =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
3067 tmp1 =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
3069 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3070 tmp0 =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
3072 tmp1 =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
3074 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3075 tmp0 =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
3077 tmp1 =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
3079 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3080 tmp0 =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
3082 tmp1 =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
3084 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Filter coefficients packed as two signed halfwords per word
 * (low half, high half): 0xfffb0001 -> (1, -5), 0x140014 -> (20, 20),
 * 0x1fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 does not fit in int32_t; the initialiser relies
 * on implementation-defined narrowing. */
3093 const int32_t filt_const0 = 0xfffb0001;
3094 const int32_t filt_const1 = 0x140014;
3095 const int32_t filt_const2 = 0x1fffb;
3097 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3098 v16i8 mask0, mask1, mask2;
3099 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3100 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
3101 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3102 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* Replicate each packed coefficient pair into every word lane. */
3107 filt0 = (v8i16) __msa_fill_w(filt_const0);
3108 filt1 = (v8i16) __msa_fill_w(filt_const1);
3109 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Presumably copies the upper doubleword of each even-numbered hz_out vector
 * into the following odd-numbered one, i.e. each even vector carries two
 * rows - TODO confirm against the PCKOD_D2_SH definition. */
3124 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
3125 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
/* Halfword-interleave each row with its predecessor (only the right-half
 * variants are used in this body). */
3126 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3127 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3128 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3129 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* Vertical pass: tmp0/tmp1 hold two consecutive output rows as words
 * (rounded by 10 and saturated inside AVC_DOT_SW3_SW); pckev_h narrows both
 * into one halfword vector. */
3131 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3133 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3135 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3136 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3138 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3140 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Working vectors for a byte-domain horizontal filter: destination rows
 * (dst*), packed outputs (out*), source rows (src*), shuffle masks, shuffled
 * tap pairs (vec*) and halfword accumulators (res*). */
3149 v16u8 dst0, dst1, dst2, dst3;
3150 v16i8 out0, out1, out2, out3,
src0,
src1, src2, src3, src4, src5, src6;
3151 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3152 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3153 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* Weights -5 and +20, replicated into every byte lane. */
3154 v16i8 minus5b = __msa_ldi_b(-5);
3155 v16i8 plus20b = __msa_ldi_b(20);
/* Four iterations, counting down. */
3163 for (loop_cnt = 4; loop_cnt--;) {
/* Gather tap pairs: mask0..2 read from a single vector, mask3..5 from an
 * adjacent pair of vectors (here src2 and src3). */
3176 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3178 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3180 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
/* res0..3: presumably pairwise sum of the mask0/mask3 vectors, then
 * accumulate the -5 and +20 weighted pairs (project macros, defined
 * elsewhere). */
3181 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3182 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3183 minus5b, res0, res1, res2, res3);
3184 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3185 plus20b, res0, res1, res2, res3);
/* Same sequence for the src4/src5 and src6/src7 vector pairs -> res4..7. */
3186 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3187 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3188 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3189 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3190 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3191 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3192 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3193 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3194 minus5b, res4, res5, res6, res7);
3195 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3196 plus20b, res4, res5, res6, res7);
3198 src0, src2, src4, src6);
/* Signed rounding average of each filtered row with the corresponding
 * src0/src2/src4/src6 vector. */
3205 out0 = __msa_aver_s_b(out0,
src0);
3206 out1 = __msa_aver_s_b(out1, src2);
3207 out2 = __msa_aver_s_b(out2, src4);
3208 out3 = __msa_aver_s_b(out3, src6);
/* Working vectors for a byte-domain horizontal filter: destination rows
 * (dst*), packed outputs (out*), source rows (src*), shuffle masks, shuffled
 * tap pairs (vec*) and halfword accumulators (res*). */
3221 v16u8 dst0, dst1, dst2, dst3;
3222 v16i8 out0, out1, out2, out3,
src0,
src1, src2, src3, src4, src5, src6;
3223 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3224 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3225 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* Weights -5 and +20, replicated into every byte lane. */
3226 v16i8 minus5b = __msa_ldi_b(-5);
3227 v16i8 plus20b = __msa_ldi_b(20);
/* Four iterations, counting down. */
3235 for (loop_cnt = 4; loop_cnt--;) {
/* Gather tap pairs: mask0..2 read from a single vector, mask3..5 from an
 * adjacent pair of vectors (here src2 and src3). */
3248 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3250 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3252 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
/* res0..3: presumably pairwise sum of the mask0/mask3 vectors, then
 * accumulate the -5 and +20 weighted pairs (project macros, defined
 * elsewhere). */
3253 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3254 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3255 minus5b, res0, res1, res2, res3);
3256 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3257 plus20b, res0, res1, res2, res3);
/* Same sequence for the src4/src5 and src6/src7 vector pairs -> res4..7. */
3258 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3259 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3260 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3261 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3262 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3263 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3264 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3265 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3266 minus5b, res4, res5, res6, res7);
3267 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3268 plus20b, res4, res5, res6, res7);
3270 src0, src2, src4, src6);
/* Signed rounding average of each filtered row with the corresponding
 * src0/src2/src4/src6 vector. */
3277 out0 = __msa_aver_s_b(out0,
src0);
3278 out1 = __msa_aver_s_b(out1, src2);
3279 out2 = __msa_aver_s_b(out2, src4);
3280 out3 = __msa_aver_s_b(out3, src6);
/* 64-bit scalars and zero-initialised destination vectors, followed by the
 * source rows, shuffle masks, packed outputs (tmp*), shuffled tap pairs
 * (vec*) and halfword accumulators (res*). */
3292 uint64_t tp0, tp1, tp2, tp3;
3293 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3294 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3295 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3296 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3297 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* Weights -5 and +20, replicated into every byte lane. */
3298 v16i8 minus5b = __msa_ldi_b(-5);
3299 v16i8 plus20b = __msa_ldi_b(20);
/* Rows src0..src3 -> res0..3: gather tap pairs per row with mask0/1/2, then
 * presumably pairwise-sum the mask0 pairs and accumulate the -5 (mask1) and
 * +20 (mask2) weighted pairs (project macros, defined elsewhere). */
3305 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3306 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3308 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3309 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3310 res0, res1, res2, res3);
3312 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3313 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3314 res0, res1, res2, res3);
/* Rows src4..src7 -> res4..7, same sequence. */
3315 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3316 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3317 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3318 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3319 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3320 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3321 res4, res5, res6, res7);
3322 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3323 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3324 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3325 res4, res5, res6, res7);
/* Presumably slides each source row by 2 bytes so the sample used in the
 * average below lines up with the filter output - TODO confirm against the
 * SLDI_B4_SB definition. */
3328 SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
3329 src4, src5, src6, src7);
/* Signed rounding average of the filtered rows with the source samples. */
3338 tmp0 = __msa_aver_s_b(tmp0,
src0);
3339 tmp1 = __msa_aver_s_b(tmp1,
src1);
3340 tmp2 = __msa_aver_s_b(tmp2, src4);
3341 tmp3 = __msa_aver_s_b(tmp3, src5);
/* Presumably stores both doublewords of each of the four vectors as eight
 * rows at dst with the given stride (macro defined elsewhere). */
3351 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
/* 64-bit scalars and zero-initialised destination vectors, followed by the
 * source rows, shuffle masks, packed outputs (tmp*), shuffled tap pairs
 * (vec*) and halfword accumulators (res*). */
3357 uint64_t tp0, tp1, tp2, tp3;
3358 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3359 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3360 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3361 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3362 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* Weights -5 and +20, replicated into every byte lane. */
3363 v16i8 minus5b = __msa_ldi_b(-5);
3364 v16i8 plus20b = __msa_ldi_b(20);
/* Rows src0..src3 -> res0..3: gather tap pairs per row with mask0/1/2, then
 * presumably pairwise-sum the mask0 pairs and accumulate the -5 (mask1) and
 * +20 (mask2) weighted pairs (project macros, defined elsewhere). */
3370 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3371 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3373 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3374 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3375 res0, res1, res2, res3);
3377 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3378 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3379 res0, res1, res2, res3);
/* Rows src4..src7 -> res4..7, same sequence. */
3380 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3381 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3382 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3383 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3384 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3385 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3386 res4, res5, res6, res7);
3387 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3388 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3389 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3390 res4, res5, res6, res7);
/* Presumably slides each source row by 3 bytes so the sample used in the
 * average below lines up with the filter output - TODO confirm against the
 * SLDI_B4_SB definition. */
3393 SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
3394 src4, src5, src6, src7);
/* Signed rounding average of the filtered rows with the source samples. */
3403 tmp0 = __msa_aver_s_b(tmp0,
src0);
3404 tmp1 = __msa_aver_s_b(tmp1,
src1);
3405 tmp2 = __msa_aver_s_b(tmp2, src4);
3406 tmp3 = __msa_aver_s_b(tmp3, src5);
/* Presumably stores both doublewords of each of the four vectors as eight
 * rows at dst with the given stride (macro defined elsewhere). */
3416 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
/* 32-bit scalars, four source rows, the packed result and shuffled tap
 * pairs for a small horizontal filter. */
3422 uint32_t tp0, tp1, tp2, tp3;
3424 v16i8
src0,
src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3425 v16i8 mask0, mask1, mask2;
/* Weights -5 and +20, replicated into every byte lane. */
3427 v16i8 minus5b = __msa_ldi_b(-5);
3428 v16i8 plus20b = __msa_ldi_b(20);
/* Accumulate the -5 and +20 weighted tap pairs into out0/out1 (project
 * macro - semantics presumed). */
3436 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3438 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
/* Narrow both halfword vectors to bytes (even-indexed bytes kept). */
3441 res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
/* Pack word 0 of src0..src3 into one vector: src1 -> word 1 of src0,
 * src3 -> word 1 of src2, then the src2/src3 pair -> upper doubleword. */
3444 src0 = (v16i8) __msa_insve_w((v4i32)
src0, 1, (v4i32)
src1);
3445 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3446 src0 = (v16i8) __msa_insve_d((v2i64)
src0, 1, (v2i64)
src1);
/* Signed rounding average with the packed source samples, then flip the
 * top bit of every byte (XOR 128) - presumably converting the
 * signed-biased samples back to unsigned. */
3447 res = __msa_aver_s_b(res,
src0);
3448 res = (v16i8) __msa_xori_b((v16u8) res, 128);
/* Unsigned rounding average with dst0. */
3451 dst0 = __msa_aver_u_b((v16u8) res, dst0);
/* 32-bit scalars, four source rows, the packed result and shuffled tap
 * pairs for a small horizontal filter. */
3458 uint32_t tp0, tp1, tp2, tp3;
3460 v16i8
src0,
src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3461 v16i8 mask0, mask1, mask2;
/* Weights -5 and +20, replicated into every byte lane. */
3463 v16i8 minus5b = __msa_ldi_b(-5);
3464 v16i8 plus20b = __msa_ldi_b(20);
/* Accumulate the -5 and +20 weighted tap pairs into out0/out1 (project
 * macro - semantics presumed). */
3472 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3474 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
/* Narrow both halfword vectors to bytes (even-indexed bytes kept). */
3477 res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
/* Pack word 0 of src0..src3 into one vector: src1 -> word 1 of src0,
 * src3 -> word 1 of src2, then the src2/src3 pair -> upper doubleword. */
3480 src0 = (v16i8) __msa_insve_w((v4i32)
src0, 1, (v4i32)
src1);
3481 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3482 src0 = (v16i8) __msa_insve_d((v2i64)
src0, 1, (v2i64)
src1);
/* Signed rounding average with the packed source samples, then flip the
 * top bit of every byte (XOR 128) - presumably converting the
 * signed-biased samples back to unsigned. */
3483 res = __msa_aver_s_b(res,
src0);
3484 res = (v16i8) __msa_xori_b((v16u8) res, 128);
/* Unsigned rounding average with dst0. */
3487 dst0 = __msa_aver_u_b((v16u8) res, dst0);
/* Destination rows, source rows, shuffle masks, shuffled tap pairs (vec*)
 * and halfword accumulators (res*) for a byte-domain horizontal filter. */
3495 v16u8 dst0, dst1, dst2, dst3;
3496 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3497 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3499 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* Weights -5 and +20, replicated into every byte lane. */
3500 v16i8 minus5b = __msa_ldi_b(-5);
3501 v16i8 plus20b = __msa_ldi_b(20);
/* Four iterations, counting down. */
3506 for (loop_cnt = 4; loop_cnt--;) {
/* Gather the three tap pairs from src2 and from src3, each vector shuffled
 * on its own with mask0/1/2. */
3519 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
3521 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
3523 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
/* res0..3: presumably pairwise sum of the mask0 pairs, then accumulate the
 * -5 and +20 weighted pairs (project macros, defined elsewhere). */
3524 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3525 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3526 minus5b, res0, res1, res2, res3);
3527 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3528 plus20b, res0, res1, res2, res3);
/* Same sequence for src4..src7 -> res4..7. */
3529 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
3530 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
3531 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
3532 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
3533 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
3534 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
3535 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3536 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3537 minus5b, res4, res5, res6, res7);
3538 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3539 plus20b, res4, res5, res6, res7);
/* Narrow the halfword accumulators pairwise to bytes (macro defined
 * elsewhere). */
3544 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
/* 64-bit scalars, output vectors (out2/out3/out6/out7 zero-initialised),
 * source rows, shuffle masks, shuffled tap pairs (vec*) and halfword
 * accumulators (res*). */
3557 uint64_t tp0, tp1, tp2, tp3;
3558 v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
3559 v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
3560 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3561 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3563 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* Weights -5 and +20, replicated into every byte lane. */
3564 v16i8 minus5b = __msa_ldi_b(-5);
3565 v16i8 plus20b = __msa_ldi_b(20);
/* Rows src0..src3 -> res0..3: gather tap pairs per row with mask0/1/2, then
 * presumably pairwise-sum the mask0 pairs and accumulate the -5 (mask1) and
 * +20 (mask2) weighted pairs (project macros, defined elsewhere). */
3572 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3573 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3575 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3576 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3577 res0, res1, res2, res3);
3579 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3580 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3581 res0, res1, res2, res3);
/* Rows src4..src7 -> res4..7, same sequence. */
3582 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3583 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3584 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3585 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3586 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3587 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3588 res4, res5, res6, res7);
3589 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3590 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3591 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3592 res4, res5, res6, res7);
/* Presumably stores both doublewords of out0, out1, out4 and out5 as eight
 * rows at dst with the given stride (macro defined elsewhere). */
3609 ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
/* 32-bit scalars, the packed result, a zero-initialised destination vector,
 * four source rows and shuffled tap pairs. */
3615 uint32_t tp0, tp1, tp2, tp3;
3616 v16u8 res, dst0 = { 0 };
3617 v16i8
src0,
src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
3618 v16i8 mask0, mask1, mask2;
/* Weights -5 and +20, replicated into every byte lane. */
3620 v16i8 minus5b = __msa_ldi_b(-5);
3621 v16i8 plus20b = __msa_ldi_b(20);
/* Accumulate the -5 and +20 weighted tap pairs into res0/res1 (project
 * macro - semantics presumed). */
3629 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
3631 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
/* Unsigned rounding average of the filtered result with dst0. */
3637 res = __msa_aver_u_b(res, dst0);
3645 int16_t filt_const0 = 0xfb01;
3646 int16_t filt_const1 = 0x1414;
3647 int16_t filt_const2 = 0x1fb;
3648 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3649 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3650 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3651 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3652 v16i8 src65_l, src87_l, filt0, filt1, filt2;
3653 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3655 filt0 = (v16i8) __msa_fill_h(filt_const0);
3656 filt1 = (v16i8) __msa_fill_h(filt_const1);
3657 filt2 = (v16i8) __msa_fill_h(filt_const2);
3665 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
3667 ILVL_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_l, src21_l,
3670 for (loop_cnt = 4; loop_cnt--;) {
3675 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3676 src65_r, src76_r, src87_r);
3677 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3678 src65_l, src76_l, src87_l);
3679 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3680 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3681 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3682 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3683 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3684 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3685 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3686 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* NOTE(review): SAT_SH4_SH and PCKEV_B4_UB are defined elsewhere; with
 * argument 7 the former presumably clamps each halfword to the signed
 * 8-bit range before the even bytes are packed into res0..res3 - confirm. */
3688 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3690 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3691 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3692 out3_r, res0, res1, res2, res3);
/* Rounded signed byte average of res0..res3 with src2..src5. */
3693 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
3694 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
3695 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
3696 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
/* Filter taps packed as two signed bytes per halfword, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * Each constant is splatted with __msa_fill_h() and used as the v16i8
 * coefficient operand of AVC_DOT_SH3_SH.
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initializer
 * depends on the implementation-defined conversion to -1279. */
3722 int16_t filt_const0 = 0xfb01;
3723 int16_t filt_const1 = 0x1414;
3724 int16_t filt_const2 = 0x1fb;
3725 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3726 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3727 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3728 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3729 v16i8 src65_l, src87_l, filt0, filt1, filt2;
3730 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3732 filt0 = (v16i8) __msa_fill_h(filt_const0);
3733 filt1 = (v16i8) __msa_fill_h(filt_const1);
3734 filt2 = (v16i8) __msa_fill_h(filt_const2);
3742 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
3744 ILVL_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_l, src21_l,
3747 for (loop_cnt = 4; loop_cnt--;) {
3752 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3753 src65_r, src76_r, src87_r);
3754 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3755 src65_l, src76_l, src87_l);
3756 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3757 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3758 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3759 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3760 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3761 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3762 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3763 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* NOTE(review): SAT_SH4_SH and PCKEV_B4_UB are defined elsewhere; with
 * argument 7 the former presumably clamps each halfword to the signed
 * 8-bit range before the even bytes are packed into res0..res3 - confirm. */
3765 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3767 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3768 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3769 out3_r, res0, res1, res2, res3);
/* Rounded signed byte average of res0..res3 with src3..src6. */
3770 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
3771 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
3772 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
3773 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
3797 uint64_t tp0, tp1, tp2, tp3;
/* Filter taps packed as two signed bytes per halfword, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * Each constant is splatted with __msa_fill_h() and used as the v16i8
 * coefficient operand of AVC_DOT_SH3_SH.
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initializer
 * depends on the implementation-defined conversion to -1279. */
3798 const int16_t filt_const0 = 0xfb01;
3799 const int16_t filt_const1 = 0x1414;
3800 const int16_t filt_const2 = 0x1fb;
3801 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3802 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3803 v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3804 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3805 v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3806 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3808 filt0 = (v16i8) __msa_fill_h(filt_const0);
3809 filt1 = (v16i8) __msa_fill_h(filt_const1);
3810 filt2 = (v16i8) __msa_fill_h(filt_const2);
3818 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
3820 LD_SB8(
src,
stride, src7, src8, src9, src10, src11, src12, src13, src14);
3821 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
3822 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3823 src87_r, src98_r, src109_r);
3824 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3825 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3826 out2_r =
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3827 out3_r =
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
3829 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3830 src21_r, src32_r, src43_r);
3831 out4_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3832 out5_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3833 out6_r =
AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3834 out7_r =
AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
/* NOTE(review): PCKEV_D2_SB is defined elsewhere; presumably it packs the
 * low doublewords of (src8, src9) into tmp2 and of (src10, src11) into
 * tmp3 - confirm against the macro. */
3835 PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3);
/* NOTE(review): with argument 7 SAT_SH4_SH presumably clamps each
 * halfword to the signed 8-bit range - confirm. */
3838 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3839 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3848 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3849 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
/* Rounded signed byte average of out0..out3 with tmp0..tmp3. */
3850 out0 = __msa_aver_s_b(out0, tmp0);
3851 out1 = __msa_aver_s_b(out1, tmp1);
3852 out2 = __msa_aver_s_b(out2, tmp2);
3853 out3 = __msa_aver_s_b(out3, tmp3);
3855 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3857 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
3863 uint64_t tp0, tp1, tp2, tp3;
/* Filter taps packed as two signed bytes per halfword, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * Each constant is splatted with __msa_fill_h() and used as the v16i8
 * coefficient operand of AVC_DOT_SH3_SH.
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initializer
 * depends on the implementation-defined conversion to -1279. */
3864 const int16_t filt_const0 = 0xfb01;
3865 const int16_t filt_const1 = 0x1414;
3866 const int16_t filt_const2 = 0x1fb;
3867 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3868 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3869 v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3870 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3871 v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3872 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3874 filt0 = (v16i8) __msa_fill_h(filt_const0);
3875 filt1 = (v16i8) __msa_fill_h(filt_const1);
3876 filt2 = (v16i8) __msa_fill_h(filt_const2);
3884 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
3886 LD_SB8(
src,
stride, src7, src8, src9, src10, src11, src12, src13, src14);
3887 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
3888 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3889 src87_r, src98_r, src109_r);
3890 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3891 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3892 out2_r =
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3893 out3_r =
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
3895 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3896 src21_r, src32_r, src43_r);
3897 out4_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3898 out5_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3899 out6_r =
AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3900 out7_r =
AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
/* NOTE(review): PCKEV_D2_SB is defined elsewhere; presumably it packs the
 * low doublewords of (src9, src10) into tmp2 and of (src11, src12) into
 * tmp3 - confirm against the macro. */
3901 PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3);
/* NOTE(review): with argument 7 SAT_SH4_SH presumably clamps each
 * halfword to the signed 8-bit range - confirm. */
3904 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3905 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3914 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3915 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
/* Rounded signed byte average of out0..out3 with tmp0..tmp3. */
3916 out0 = __msa_aver_s_b(out0, tmp0);
3917 out1 = __msa_aver_s_b(out1, tmp1);
3918 out2 = __msa_aver_s_b(out2, tmp2);
3919 out3 = __msa_aver_s_b(out3, tmp3);
3921 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3923 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
3929 uint32_t tp0, tp1, tp2, tp3;
/* Filter taps packed as two signed bytes per halfword, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * Each constant is splatted with __msa_fill_h() and used as the v16i8
 * coefficient operand of AVC_DOT_SH3_SH.
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initializer
 * depends on the implementation-defined conversion to -1279. */
3930 int16_t filt_const0 = 0xfb01;
3931 int16_t filt_const1 = 0x1414;
3932 int16_t filt_const2 = 0x1fb;
3933 v16u8 res, dst0 = { 0 };
3934 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3935 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3936 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3939 filt0 = (v16i8) __msa_fill_h(filt_const0);
3940 filt1 = (v16i8) __msa_fill_h(filt_const1);
3941 filt2 = (v16i8) __msa_fill_h(filt_const2);
3947 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
3949 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3952 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3954 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
/* Gather word 0 of src2, src3, src4 and src5 into the four word lanes of
 * src32_r: __msa_insve_w() copies word 0 of its last operand into lane 1
 * of the first, and __msa_insve_d() then places the src4/src5 pair in the
 * upper doubleword.  src32_r and src54_r are reused as scratch here; the
 * interleaved values they held were already consumed by the ILVR_D2_SB
 * calls above. */
3956 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3957 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
3958 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
3959 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
3960 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
/* Rounded unsigned byte average with the gathered source words, then
 * with dst0. */
3966 res = __msa_aver_u_b(res, (v16u8) src32_r);
3967 dst0 = __msa_aver_u_b(res, dst0);
3974 uint32_t tp0, tp1, tp2, tp3;
/* Filter taps packed as two signed bytes per halfword, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * Each constant is splatted with __msa_fill_h() and used as the v16i8
 * coefficient operand of AVC_DOT_SH3_SH.
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initializer
 * depends on the implementation-defined conversion to -1279. */
3975 int16_t filt_const0 = 0xfb01;
3976 int16_t filt_const1 = 0x1414;
3977 int16_t filt_const2 = 0x1fb;
3978 v16u8 res, dst0 = { 0 };
3979 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3980 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3981 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3984 filt0 = (v16i8) __msa_fill_h(filt_const0);
3985 filt1 = (v16i8) __msa_fill_h(filt_const1);
3986 filt2 = (v16i8) __msa_fill_h(filt_const2);
3993 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
3995 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3998 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4000 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4002 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4003 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
/* Gather word 0 of src3, src4, src5 and src6 into the four word lanes of
 * src32_r: __msa_insve_w() copies word 0 of its last operand into lane 1
 * of the first, and __msa_insve_d() then places the src5/src6 pair in the
 * upper doubleword.  src32_r and src54_r are reused as scratch here; the
 * interleaved values they held were already consumed by the ILVR_D2_SB
 * calls above. */
4009 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
4010 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
4011 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
/* Rounded unsigned byte average with the gathered source words, then
 * with dst0. */
4012 res = __msa_aver_u_b(res, (v16u8) src32_r);
4013 dst0 = __msa_aver_u_b(res, dst0);
4064 sizeof(uint8_t), dst,
stride);
4080 sizeof(uint8_t), dst,
stride);
4097 sizeof(uint8_t), dst,
stride);
4113 sizeof(uint8_t), dst,
stride);
4119 uint64_t tp0, tp1, tp2, tp3;
4120 uint8_t *dst_tmp = dst;
4121 const uint8_t *src_tmp =
src - (2 *
stride) - 2;
4122 uint32_t multiple8_cnt, loop_cnt;
/* Filter taps packed as two signed halfwords per word, low halfword
 * first: 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20),
 * 0x0001fffb -> (-5, 1).  Each constant is splatted with __msa_fill_w()
 * and used as the v8i16 coefficient operand of AVC_DOT_SW3_SW, which
 * also applies the rounding shift by 10 and the saturation.
 * NOTE(review): 0xfffb0001 does not fit in int32_t, so this initializer
 * depends on implementation-defined conversion of the unsigned constant. */
4123 const int32_t filt_const0 = 0xfffb0001;
4124 const int32_t filt_const1 = 0x140014;
4125 const int32_t filt_const2 = 0x1fffb;
4126 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4127 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4129 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4130 v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4131 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4132 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4133 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4134 v8i16 hz_out87_l, filt0, filt1, filt2;
4135 v4i32 tmp0_w, tmp1_w;
4137 filt0 = (v8i16) __msa_fill_w(filt_const0);
4138 filt1 = (v8i16) __msa_fill_w(filt_const1);
4139 filt2 = (v8i16) __msa_fill_w(filt_const2);
4143 for (multiple8_cnt = 2; multiple8_cnt--;) {
4157 for (loop_cnt = 4; loop_cnt--;) {
4164 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4165 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4167 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4168 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4170 ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
4172 ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
4174 tmp0_w =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4176 tmp1_w =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4178 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4179 tmp0_w =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4181 tmp1_w =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4183 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* Rounding right shift by 5 of hz_out2 and hz_out3. */
4185 tmp1 = __msa_srari_h(hz_out2, 5);
4186 tmp3 = __msa_srari_h(hz_out3, 5);
/* Rounded signed halfword average of the packed AVC_DOT_SW3_SW results
 * (tmp0, tmp2) with tmp1 and tmp3; the results land in tmp0 and tmp1. */
4189 tmp0 = __msa_aver_s_h(tmp0, tmp1);
4190 tmp1 = __msa_aver_s_h(tmp2, tmp3);
4196 dst0 = __msa_aver_u_b(out0, dst0);
4206 ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4208 ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4210 tmp0_w =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4212 tmp1_w =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4214 tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4215 tmp0_w =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4217 tmp1_w =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4219 tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4221 tmp5 = __msa_srari_h(hz_out4, 5);
4222 tmp7 = __msa_srari_h(hz_out5, 5);
4225 tmp2 = __msa_aver_s_h(tmp4, tmp5);
4226 tmp3 = __msa_aver_s_h(tmp6, tmp7);
4232 dst1 = __msa_aver_u_b(out1, dst1);
4251 uint64_t tp0, tp1, tp2, tp3;
4252 uint8_t *dst_tmp = dst;
4253 const uint8_t *src_tmp =
src - (2 *
stride) - 2;
4254 uint32_t multiple8_cnt, loop_cnt;
/* Filter taps packed as two signed halfwords per word, low halfword
 * first: 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20),
 * 0x0001fffb -> (-5, 1).  Each constant is splatted with __msa_fill_w()
 * and used as the v8i16 coefficient operand of AVC_DOT_SW3_SW, which
 * also applies the rounding shift by 10 and the saturation.
 * NOTE(review): 0xfffb0001 does not fit in int32_t, so this initializer
 * depends on implementation-defined conversion of the unsigned constant. */
4255 const int32_t filt_const0 = 0xfffb0001;
4256 const int32_t filt_const1 = 0x140014;
4257 const int32_t filt_const2 = 0x1fffb;
4258 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4259 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4261 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4262 v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4263 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4264 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4265 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4266 v8i16 hz_out87_l, filt0, filt1, filt2;
4267 v4i32 tmp0_w, tmp1_w;
4269 filt0 = (v8i16) __msa_fill_w(filt_const0);
4270 filt1 = (v8i16) __msa_fill_w(filt_const1);
4271 filt2 = (v8i16) __msa_fill_w(filt_const2);
4275 for (multiple8_cnt = 2; multiple8_cnt--;) {
4289 for (loop_cnt = 4; loop_cnt--;) {
4296 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4297 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4299 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4300 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4302 ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r);
4303 ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l);
4305 tmp0_w =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4307 tmp1_w =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4309 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4310 tmp0_w =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4312 tmp1_w =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4314 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* Rounding right shift by 5 of hz_out3 and hz_out4. */
4316 tmp1 = __msa_srari_h(hz_out3, 5);
4317 tmp3 = __msa_srari_h(hz_out4, 5);
/* Rounded signed halfword average of the packed AVC_DOT_SW3_SW results
 * (tmp0, tmp2) with tmp1 and tmp3; the results land in tmp0 and tmp1. */
4320 tmp0 = __msa_aver_s_h(tmp0, tmp1);
4321 tmp1 = __msa_aver_s_h(tmp2, tmp3);
4326 dst0 = __msa_aver_u_b(out0, dst0);
4336 ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4338 ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4340 tmp0_w =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4342 tmp1_w =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4344 tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4345 tmp0_w =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4347 tmp1_w =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4349 tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4351 tmp5 = __msa_srari_h(hz_out5, 5);
4352 tmp7 = __msa_srari_h(hz_out6, 5);
4355 tmp2 = __msa_aver_s_h(tmp4, tmp5);
4356 tmp3 = __msa_aver_s_h(tmp6, tmp7);
4361 dst1 = __msa_aver_u_b(out1, dst1);
/* Filter taps packed as two signed halfwords per word, low halfword
 * first: 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20),
 * 0x0001fffb -> (-5, 1).  Each constant is splatted with __msa_fill_w()
 * and used as the v8i16 coefficient operand of AVC_DOT_SW3_SW, which
 * also applies the rounding shift by 10 and the saturation.
 * NOTE(review): 0xfffb0001 does not fit in int32_t, so this initializer
 * depends on implementation-defined conversion of the unsigned constant. */
4380 const int32_t filt_const0 = 0xfffb0001;
4381 const int32_t filt_const1 = 0x140014;
4382 const int32_t filt_const2 = 0x1fffb;
4383 uint64_t tp0, tp1, tp2, tp3;
4384 v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4385 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4386 v16i8 src11, src12, mask0, mask1, mask2;
4387 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4388 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4389 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4390 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4391 v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4392 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4393 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4394 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4395 v4i32 tmp0_w, tmp1_w;
4399 filt0 = (v8i16) __msa_fill_w(filt_const0);
4400 filt1 = (v8i16) __msa_fill_w(filt_const1);
4401 filt2 = (v8i16) __msa_fill_w(filt_const2);
4424 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4425 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4426 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4427 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4428 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4429 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4430 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4431 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
4433 tmp0_w =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4435 tmp1_w =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4437 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4438 tmp0_w =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4440 tmp1_w =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4442 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4443 tmp0_w =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4445 tmp1_w =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4447 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4448 tmp0_w =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4450 tmp1_w =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4452 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* NOTE(review): SRARI_H4_SH and SAT_SH4_SH are defined elsewhere;
 * presumably an in-place rounding right shift by 5 followed by a clamp to
 * the signed 8-bit range of hz_out2..hz_out5 - confirm. */
4454 SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
4455 SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
/* Rounded signed halfword average of tmp0..tmp3 with hz_out2..hz_out5. */
4461 tmp0 = __msa_aver_s_h(tmp0, hz_out2);
4462 tmp1 = __msa_aver_s_h(tmp1, hz_out3);
4463 tmp2 = __msa_aver_s_h(tmp2, hz_out4);
4464 tmp3 = __msa_aver_s_h(tmp3, hz_out5);
4478 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4479 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4481 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4482 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4484 tmp0_w =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4486 tmp1_w =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4488 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4489 tmp0_w =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4491 tmp1_w =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4493 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4494 tmp0_w =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4496 tmp1_w =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4498 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4499 tmp0_w =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4501 tmp1_w =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4503 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4505 SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
4506 SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
4512 tmp0 = __msa_aver_s_h(tmp0, hz_out6);
4513 tmp1 = __msa_aver_s_h(tmp1, hz_out7);
4514 tmp2 = __msa_aver_s_h(tmp2, hz_out8);
4515 tmp3 = __msa_aver_s_h(tmp3, hz_out9);
/* Filter taps packed as two signed halfwords per word, low halfword
 * first: 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20),
 * 0x0001fffb -> (-5, 1).  Each constant is splatted with __msa_fill_w()
 * and used as the v8i16 coefficient operand of AVC_DOT_SW3_SW, which
 * also applies the rounding shift by 10 and the saturation.
 * NOTE(review): 0xfffb0001 does not fit in int32_t, so this initializer
 * depends on implementation-defined conversion of the unsigned constant. */
4526 const int32_t filt_const0 = 0xfffb0001;
4527 const int32_t filt_const1 = 0x140014;
4528 const int32_t filt_const2 = 0x1fffb;
4529 uint64_t tp0, tp1, tp2, tp3;
4530 v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4531 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4532 v16i8 src11, src12, mask0, mask1, mask2;
4533 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4534 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4535 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4536 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4537 v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4538 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4539 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4540 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4541 v4i32 tmp0_w, tmp1_w;
4545 filt0 = (v8i16) __msa_fill_w(filt_const0);
4546 filt1 = (v8i16) __msa_fill_w(filt_const1);
4547 filt2 = (v8i16) __msa_fill_w(filt_const2);
4570 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4571 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4572 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4573 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4574 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4575 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4576 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4577 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
4579 tmp0_w =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4581 tmp1_w =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4583 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4584 tmp0_w =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4586 tmp1_w =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4588 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4589 tmp0_w =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4591 tmp1_w =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4593 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4594 tmp0_w =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4596 tmp1_w =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4598 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* NOTE(review): SRARI_H4_SH and SAT_SH4_SH are defined elsewhere;
 * presumably an in-place rounding right shift by 5 followed by a clamp to
 * the signed 8-bit range of hz_out3..hz_out6 - confirm. */
4600 SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
4601 SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
/* Rounded signed halfword average of tmp0..tmp3 with hz_out3..hz_out6. */
4607 tmp0 = __msa_aver_s_h(tmp0, hz_out3);
4608 tmp1 = __msa_aver_s_h(tmp1, hz_out4);
4609 tmp2 = __msa_aver_s_h(tmp2, hz_out5);
4610 tmp3 = __msa_aver_s_h(tmp3, hz_out6);
4624 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4625 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4627 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4628 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4630 tmp0_w =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4632 tmp1_w =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4634 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4635 tmp0_w =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4637 tmp1_w =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4639 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4640 tmp0_w =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4642 tmp1_w =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4644 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4645 tmp0_w =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4647 tmp1_w =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4649 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4651 SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
4652 SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
4658 tmp0 = __msa_aver_s_h(tmp0, hz_out7);
4659 tmp1 = __msa_aver_s_h(tmp1, hz_out8);
4660 tmp2 = __msa_aver_s_h(tmp2, hz_out9);
4661 tmp3 = __msa_aver_s_h(tmp3, hz_out10);
4672 uint32_t tp0, tp1, tp2, tp3;
/* Filter taps packed as two signed halfwords per word, low halfword
 * first: 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20),
 * 0x0001fffb -> (-5, 1).  Each constant is splatted with __msa_fill_w()
 * and used as the v8i16 coefficient operand of AVC_DOT_SW3_SW, which
 * also applies the rounding shift by 10 and the saturation.
 * NOTE(review): 0xfffb0001 does not fit in int32_t, so this initializer
 * depends on implementation-defined conversion of the unsigned constant. */
4673 const int32_t filt_const0 = 0xfffb0001;
4674 const int32_t filt_const1 = 0x140014;
4675 const int32_t filt_const2 = 0x1fffb;
4676 v16u8 res,
out = { 0 };
4677 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
4678 v16i8 mask0, mask1, mask2;
4679 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4680 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4681 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4682 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
4687 filt0 = (v8i16) __msa_fill_w(filt_const0);
4688 filt1 = (v8i16) __msa_fill_w(filt_const1);
4689 filt2 = (v8i16) __msa_fill_w(filt_const2);
4705 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4706 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4708 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4709 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4710 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4711 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4713 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4715 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4717 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4718 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4720 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4722 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Rounded signed halfword average of dst0/dst1 with hz_out2/hz_out4. */
4727 dst0 = __msa_aver_s_h(dst0, hz_out2);
4728 dst1 = __msa_aver_s_h(dst1, hz_out4);
/* Rounded unsigned byte average of res with out. */
4732 res = __msa_aver_u_b(res,
out);
/* Filter taps packed as two signed halfwords per word, low halfword
 * first: 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20),
 * 0x0001fffb -> (-5, 1).  Each constant is splatted with __msa_fill_w()
 * and used as the v8i16 coefficient operand of AVC_DOT_SW3_SW, which
 * also applies the rounding shift by 10 and the saturation.
 * NOTE(review): 0xfffb0001 does not fit in int32_t, so this initializer
 * depends on implementation-defined conversion of the unsigned constant. */
4739 const int32_t filt_const0 = 0xfffb0001;
4740 const int32_t filt_const1 = 0x140014;
4741 const int32_t filt_const2 = 0x1fffb;
4742 uint32_t tp0, tp1, tp2, tp3;
4743 v16u8 res,
out = { 0 };
4744 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
4745 v16i8 mask0, mask1, mask2;
4746 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4747 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4748 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4749 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
4754 filt0 = (v8i16) __msa_fill_w(filt_const0);
4755 filt1 = (v8i16) __msa_fill_w(filt_const1);
4756 filt2 = (v8i16) __msa_fill_w(filt_const2);
4772 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4773 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4775 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4776 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4777 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4778 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4780 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4782 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4784 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4785 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4787 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4789 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* NOTE(review): PCKEV_D2_SH is defined elsewhere; presumably it packs the
 * low doublewords of (hz_out3, hz_out4) into hz_out0 and of
 * (hz_out5, hz_out6) into hz_out1, reusing those two names - confirm. */
4791 PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
/* Rounded signed halfword average of dst0/dst1 with hz_out0/hz_out1. */
4795 dst0 = __msa_aver_s_h(dst0, hz_out0);
4796 dst1 = __msa_aver_s_h(dst1, hz_out1);
/* Rounded unsigned byte average of res with out. */
4800 res = __msa_aver_u_b(res,
out);
/* Filter taps packed as two signed bytes per halfword, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * Each constant is splatted with __msa_fill_h() and used as the v16i8
 * coefficient operand of AVC_DOT_SH3_SH.
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initializer
 * depends on the implementation-defined conversion to -1279. */
4808 int16_t filt_const0 = 0xfb01;
4809 int16_t filt_const1 = 0x1414;
4810 int16_t filt_const2 = 0x1fb;
4811 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
4812 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
4813 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4814 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
4815 v16i8 src65_l, src87_l, filt0, filt1, filt2;
4816 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
4818 filt0 = (v16i8) __msa_fill_h(filt_const0);
4819 filt1 = (v16i8) __msa_fill_h(filt_const1);
4820 filt2 = (v16i8) __msa_fill_h(filt_const2);
4827 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
4829 ILVL_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_l, src21_l,
4832 for (loop_cnt = 4; loop_cnt--;) {
4837 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
4838 src65_r, src76_r, src87_r);
4839 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
4840 src65_l, src76_l, src87_l);
4841 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
4842 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
4843 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
4844 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
4845 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
4846 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
4847 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
4848 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
4850 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4852 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
4854 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
4855 out3_r, res0, res1, res2, res3);
4877 uint64_t tp0, tp1, tp2, tp3;
/* Filter taps packed as two signed bytes per halfword, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * Each constant is splatted with __msa_fill_h() and used as the v16i8
 * coefficient operand of AVC_DOT_SH3_SH.
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initializer
 * depends on the implementation-defined conversion to -1279. */
4878 const int16_t filt_const0 = 0xfb01;
4879 const int16_t filt_const1 = 0x1414;
4880 const int16_t filt_const2 = 0x1fb;
4881 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
4882 v16u8 out0, out1, out2, out3;
4883 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10, src109_r;
4884 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
4885 v16i8 filt0, filt1, filt2;
4886 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
4888 filt0 = (v16i8) __msa_fill_h(filt_const0);
4889 filt1 = (v16i8) __msa_fill_h(filt_const1);
4890 filt2 = (v16i8) __msa_fill_h(filt_const2);
4898 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
4904 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
4905 src87_r, src98_r, src109_r);
4906 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
4907 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
4908 out2_r =
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
4909 out3_r =
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
4914 src21_r, src32_r, src43_r);
4915 out4_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
4916 out5_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
4917 out6_r =
AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
4918 out7_r =
AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
4929 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4930 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
4935 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
4937 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
4943 uint32_t tp0, tp1, tp2, tp3;
/* Filter taps packed as two signed bytes per halfword, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * Each constant is splatted with __msa_fill_h() and used as the v16i8
 * coefficient operand of AVC_DOT_SH3_SH.
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initializer
 * depends on the implementation-defined conversion to -1279. */
4944 int16_t filt_const0 = 0xfb01;
4945 int16_t filt_const1 = 0x1414;
4946 int16_t filt_const2 = 0x1fb;
4947 v16u8 res, dst0 = { 0 };
4948 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
4949 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4950 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
4953 filt0 = (v16i8) __msa_fill_h(filt_const0);
4954 filt1 = (v16i8) __msa_fill_h(filt_const1);
4955 filt2 = (v16i8) __msa_fill_h(filt_const2);
4961 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
4963 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
4966 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4968 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4970 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4971 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
4977 dst0 = __msa_aver_u_b(res, dst0);
/*
 * Fragment of a function body: the signature, braces, loads/stores and several
 * statements are missing from this listing. `row`, `out`, `dst0` and the
 * initialisation of mask3..mask5 are not visible here.
 */
4986 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4988 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
4989 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
4990 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
4991 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/*
 * Halfword shuffle indices: for output pair k, mask0 selects elements
 * (k, k+5), mask1 selects (k+1, k+4) and mask2 selects (k+2, k+3).
 */
4992 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
4993 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
4994 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* -5 and 20 replicated into every halfword lane. */
4995 v8i16 minus5h = __msa_ldi_h(-5);
4996 v8i16 plus20h = __msa_ldi_h(20);
/* 16 iterations; inside the body row takes the values 15 down to 0. */
5010 for (row = 16; row--;) {
/*
 * Shuffle (vt_res0, vt_res1) and (vt_res2, vt_res3) with mask0..mask2, then
 * again with mask3..mask5. VSHF_H3_SH is defined outside this listing.
 */
5020 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5021 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5022 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5023 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5024 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5025 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5026 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5027 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Seed each word accumulator with the sum of adjacent signed halfwords of
 * the mask0 / mask3 shuffle (both operands are the same vector). */
5028 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5029 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5030 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5031 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
/*
 * DPADD_SH2_SW is defined outside this listing; presumably it accumulates the
 * first vector times -5 and the second times 20 into the same hz_res -- TODO
 * confirm. If so, the accumulated weights are 1, -5, 20, 20, -5, 1.
 */
5032 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5033 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5034 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5035 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Macros not shown; presumably a rounding right shift by 10 and a signed
 * saturation with parameter 7 applied to all four accumulators. */
5036 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5037 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Rounding arithmetic right shift by 5 of the mask2 / mask5 shuffles. */
5038 tmp0 = __msa_srari_h(shf_vec2, 5);
5039 tmp1 = __msa_srari_h(shf_vec5, 5);
5040 tmp2 = __msa_srari_h(shf_vec8, 5);
5041 tmp3 = __msa_srari_h(shf_vec11, 5);
/* Narrow the word accumulators into tmp2/tmp3 (PCKEV_H2_SH is not shown;
 * presumably it keeps the even halfwords), then average with tmp0/tmp1. */
5044 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5045 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5046 tmp1 = __msa_aver_s_h(tmp3, tmp1);
/* Unsigned byte average of out with dst0; the statements that fill both
 * operands are not shown. */
5048 out = __msa_aver_u_b(
out, dst0);
/*
 * Fragment of a function body: the signature, braces, loads/stores and several
 * statements are missing from this listing. `row`, `out`, `dst0` and the
 * initialisation of mask3..mask5 are not visible here.
 */
5070 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5072 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
5073 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5074 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
5075 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/*
 * Halfword shuffle indices: for output pair k, mask0 selects elements
 * (k, k+5), mask1 selects (k+1, k+4) and mask2 selects (k+2, k+3).
 */
5076 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5077 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5078 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* -5 and 20 replicated into every halfword lane. */
5079 v8i16 minus5h = __msa_ldi_h(-5);
5080 v8i16 plus20h = __msa_ldi_h(20);
/* 16 iterations; inside the body row takes the values 15 down to 0. */
5094 for (row = 16; row--;) {
/*
 * Shuffle (vt_res0, vt_res1) and (vt_res2, vt_res3) with mask0..mask2, then
 * again with mask3..mask5. VSHF_H3_SH is defined outside this listing.
 */
5104 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5105 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5106 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5107 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5108 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5109 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5110 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5111 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Seed each word accumulator with the sum of adjacent signed halfwords of
 * the mask0 / mask3 shuffle (both operands are the same vector). */
5112 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5113 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5114 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5115 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
/*
 * DPADD_SH2_SW is defined outside this listing; presumably it accumulates the
 * first vector times -5 and the second times 20 into the same hz_res -- TODO
 * confirm. If so, the accumulated weights are 1, -5, 20, 20, -5, 1.
 */
5116 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5117 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5118 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5119 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Macros not shown; presumably a rounding right shift by 10 and a signed
 * saturation with parameter 7 applied to all four accumulators. */
5120 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5121 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Rounding arithmetic right shift by 5 of the mask2 / mask5 shuffles. */
5122 tmp0 = __msa_srari_h(shf_vec2, 5);
5123 tmp1 = __msa_srari_h(shf_vec5, 5);
5124 tmp2 = __msa_srari_h(shf_vec8, 5);
5125 tmp3 = __msa_srari_h(shf_vec11, 5);
/* Keep the odd-indexed halfwords of each (tmp2, tmp0) / (tmp3, tmp1) pair. */
5127 tmp0 = __msa_pckod_h(tmp2, tmp0);
5128 tmp1 = __msa_pckod_h(tmp3, tmp1);
/* Narrow the word accumulators into tmp2/tmp3 (PCKEV_H2_SH is not shown;
 * presumably it keeps the even halfwords), then average with tmp0/tmp1. */
5129 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5130 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5131 tmp1 = __msa_aver_s_h(tmp3, tmp1);
/* Unsigned byte average of out with dst0; the statements that fill both
 * operands are not shown. */
5133 out = __msa_aver_u_b(
out, dst0);
/*
 * Fragment of a function body: the signature, braces, loads/stores and several
 * statements are missing from this listing. `row` and the initialisation of
 * mask3..mask5 are not visible here.
 */
5155 v16u8
out, dst0 = { 0 };
5156 v16i8
src0,
src1, src2, src3, src4, src5, src6;
5157 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5158 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5159 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5160 v8i16 mask3, mask4, mask5;
5161 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/*
 * Halfword shuffle indices: for output pair k, mask0 selects elements
 * (k, k+5), mask1 selects (k+1, k+4) and mask2 selects (k+2, k+3).
 */
5162 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5163 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5164 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* -5 and 20 replicated into every halfword lane. */
5165 v8i16 minus5h = __msa_ldi_h(-5);
5166 v8i16 plus20h = __msa_ldi_h(20);
/* 4 iterations; inside the body row takes the values 3 down to 0. */
5178 for (row = 4; row--;) {
/*
 * Shuffle (vt_res0, vt_res1) and (vt_res2, vt_res3) with mask0..mask2, then
 * again with mask3..mask5. VSHF_H3_SH is defined outside this listing.
 */
5187 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5188 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5189 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5190 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5191 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5192 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5193 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5194 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Seed each word accumulator with the sum of adjacent signed halfwords of
 * the mask0 / mask3 shuffle (both operands are the same vector). */
5195 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5196 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5197 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5198 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
/*
 * DPADD_SH2_SW is defined outside this listing; presumably it accumulates the
 * first vector times -5 and the second times 20 into the same hz_res -- TODO
 * confirm. If so, the accumulated weights are 1, -5, 20, 20, -5, 1.
 */
5199 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5200 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5201 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5202 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Macros not shown; presumably a rounding right shift by 10 and a signed
 * saturation with parameter 7 applied to all four accumulators. */
5203 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5204 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Rounding arithmetic right shift by 5 of the mask2 / mask5 shuffles. */
5205 tmp0 = __msa_srari_h(shf_vec2, 5);
5206 tmp1 = __msa_srari_h(shf_vec5, 5);
5207 tmp2 = __msa_srari_h(shf_vec8, 5);
5208 tmp3 = __msa_srari_h(shf_vec11, 5);
/* Narrow the word accumulators into tmp2/tmp3 (PCKEV_H2_SH is not shown;
 * presumably it keeps the even halfwords), then average with tmp0/tmp1. */
5213 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5214 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5215 tmp1 = __msa_aver_s_h(tmp3, tmp1);
/* Unsigned byte average of out with dst0; the statements that fill both
 * operands are not shown. */
5217 out = __msa_aver_u_b(
out, dst0);
/*
 * Fragment of a function body: the signature, braces, loads/stores and several
 * statements are missing from this listing. `row` and the initialisation of
 * mask3..mask5 are not visible here.
 */
5234 v16u8
out, dst0 = { 0 };
5235 v16i8
src0,
src1, src2, src3, src4, src5, src6;
5236 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5237 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5238 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5239 v8i16 mask3, mask4, mask5;
5240 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/*
 * Halfword shuffle indices: for output pair k, mask0 selects elements
 * (k, k+5), mask1 selects (k+1, k+4) and mask2 selects (k+2, k+3).
 */
5241 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5242 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5243 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* -5 and 20 replicated into every halfword lane. */
5244 v8i16 minus5h = __msa_ldi_h(-5);
5245 v8i16 plus20h = __msa_ldi_h(20);
/* 4 iterations; inside the body row takes the values 3 down to 0. */
5257 for (row = 4; row--;) {
/*
 * Shuffle (vt_res0, vt_res1) and (vt_res2, vt_res3) with mask0..mask2, then
 * again with mask3..mask5. VSHF_H3_SH is defined outside this listing.
 */
5266 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5267 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5268 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5269 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5270 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5271 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5272 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5273 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Seed each word accumulator with the sum of adjacent signed halfwords of
 * the mask0 / mask3 shuffle (both operands are the same vector). */
5274 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5275 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5276 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5277 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
/*
 * DPADD_SH2_SW is defined outside this listing; presumably it accumulates the
 * first vector times -5 and the second times 20 into the same hz_res -- TODO
 * confirm. If so, the accumulated weights are 1, -5, 20, 20, -5, 1.
 */
5278 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5279 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5280 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5281 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Macros not shown; presumably a rounding right shift by 10 and a signed
 * saturation with parameter 7 applied to all four accumulators. */
5282 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5283 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Rounding arithmetic right shift by 5 of the mask2 / mask5 shuffles. */
5284 tmp0 = __msa_srari_h(shf_vec2, 5);
5285 tmp1 = __msa_srari_h(shf_vec5, 5);
5286 tmp2 = __msa_srari_h(shf_vec8, 5);
5287 tmp3 = __msa_srari_h(shf_vec11, 5);
/* Keep the odd-indexed halfwords of each (tmp2, tmp0) / (tmp3, tmp1) pair. */
5291 tmp0 = __msa_pckod_h(tmp2, tmp0);
5292 tmp1 = __msa_pckod_h(tmp3, tmp1);
/* Narrow the word accumulators into tmp2/tmp3 (PCKEV_H2_SH is not shown;
 * presumably it keeps the even halfwords), then average with tmp0/tmp1. */
5293 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5294 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5295 tmp1 = __msa_aver_s_h(tmp3, tmp1);
/* Unsigned byte average of out with dst0; the statements that fill both
 * operands are not shown. */
5297 out = __msa_aver_u_b(
out, dst0);
/*
 * Fragment of a function body: the signature, braces, loads/stores and several
 * statements are missing from this listing, so only the visible lines are
 * described. Visible flow: a byte dot-product pass (AVC_DOT_SH3_SH) produces
 * vt_res0..3, which then feed a halfword shuffle / accumulate pass.
 */
5312 uint32_t tp0, tp1, tp2, tp3;
/*
 * Each 16-bit constant holds two signed byte coefficients, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x1fb -> (-5, 1).
 * NOTE(review): 0xfb01 is above INT16_MAX, so that initializer relies on the
 * implementation-defined narrowing conversion keeping the bit pattern.
 */
5313 const int16_t filt_const0 = 0xfb01;
5314 const int16_t filt_const1 = 0x1414;
5315 const int16_t filt_const2 = 0x1fb;
5316 v16u8
out, dstv = { 0 };
5317 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
5318 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5319 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5320 v16i8 src76_l, src87_l, filt0, filt1, filt2;
5321 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5322 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5323 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/*
 * Halfword shuffle indices: for output pair k, mask0 selects elements
 * (k, k+5), mask1 selects (k+1, k+4) and mask2 selects (k+2, k+3).
 */
5324 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5325 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5326 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* -5 and 20 replicated into every halfword lane; zeros is an all-zero vector. */
5327 v8i16 minus5h = __msa_ldi_h(-5);
5328 v8i16 plus20h = __msa_ldi_h(20);
5329 v8i16 zeros = { 0 };
/* Replicate each coefficient pair into every halfword lane, viewed as bytes. */
5331 filt0 = (v16i8) __msa_fill_h(filt_const0);
5332 filt1 = (v16i8) __msa_fill_h(filt_const1);
5333 filt2 = (v16i8) __msa_fill_h(filt_const2);
/*
 * ILVR_B4_SB / ILVL_B4_SB are defined outside this listing; by the output
 * names they pair adjacent source rows into *_r and *_l vectors -- TODO
 * confirm. The trailing arguments of all four calls are on lines missing
 * from this listing.
 */
5343 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
5345 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5347 ILVL_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_l, src21_l,
5349 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/*
 * AVC_DOT_SH3_SH: signed byte dot product with filt0 followed by two
 * accumulating dot products with filt1 and filt2 (halfword results).
 * vt_res0/vt_res1 use the 10/32/54 pairs, vt_res2/vt_res3 the 21/43/65 pairs.
 */
5351 vt_res0 =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5352 vt_res1 =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5353 vt_res2 =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5354 vt_res3 =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
/* Shuffle each (right, left) result pair with mask0..mask2; VSHF_H3_SH is
 * defined outside this listing. */
5355 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5356 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5357 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5358 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/*
 * Seed each word accumulator with adjacent-halfword sums of the mask0 shuffle.
 * DPADD_SH2_SW is not shown; presumably it then adds the mask1 shuffle times
 * -5 and the mask2 shuffle times 20 -- TODO confirm.
 */
5359 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5360 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5361 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5362 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Same sequence for the next two rows (32/54/76 and 43/65/87 pairs). */
5364 vt_res0 =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5365 vt_res1 =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5366 vt_res2 =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5367 vt_res3 =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* shf_vec0/1/3/4 are reused; the mask2 shuffles go to shf_vec6 and shf_vec7
 * so that shf_vec2 and shf_vec5 from the first pass stay available below. */
5368 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5369 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5370 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5371 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5372 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5373 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5374 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5375 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* Rounding arithmetic right shift by 5 of the four mask2 shuffles. */
5382 dst0 = __msa_srari_h(shf_vec2, 5);
5383 dst1 = __msa_srari_h(shf_vec5, 5);
5384 dst2 = __msa_srari_h(shf_vec6, 5);
5385 dst3 = __msa_srari_h(shf_vec7, 5);
/* ILVEV_H2_SH is not shown; presumably it interleaves the even halfwords of
 * dstN with zeros so each kept halfword occupies a word lane -- TODO confirm. */
5389 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
5390 ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
/* Signed word average of each accumulator with the widened dstN. */
5392 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5393 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5394 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5395 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* Narrow the four word vectors into dst0 and dst2 (PCKEV_H2_SH not shown). */
5399 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
/* Unsigned byte average of out with dstv; the statements that fill both
 * operands are not shown. */
5401 out = __msa_aver_u_b(
out, dstv);
/*
 * Fragment of a function body: the signature, braces, loads/stores and several
 * statements are missing from this listing, so only the visible lines are
 * described. Visible flow: a byte dot-product pass (AVC_DOT_SH3_SH) produces
 * vt_res0..3, which then feed a halfword shuffle / accumulate pass.
 */
5408 uint32_t tp0, tp1, tp2, tp3;
/*
 * Each 16-bit constant holds two signed byte coefficients, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x1fb -> (-5, 1).
 * NOTE(review): 0xfb01 is above INT16_MAX, so that initializer relies on the
 * implementation-defined narrowing conversion keeping the bit pattern.
 */
5409 const int16_t filt_const0 = 0xfb01;
5410 const int16_t filt_const1 = 0x1414;
5411 const int16_t filt_const2 = 0x1fb;
5412 v16u8
out, dstv = { 0 };
5413 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
5414 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5415 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5416 v16i8 src76_l, src87_l, filt0, filt1, filt2;
5417 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5418 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5419 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/*
 * Halfword shuffle indices: for output pair k, mask0 selects elements
 * (k, k+5), mask1 selects (k+1, k+4) and mask2 selects (k+2, k+3).
 */
5420 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5421 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5422 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* -5 and 20 replicated into every halfword lane; zeros is an all-zero vector. */
5423 v8i16 minus5h = __msa_ldi_h(-5);
5424 v8i16 plus20h = __msa_ldi_h(20);
5425 v8i16 zeros = { 0 };
/* Replicate each coefficient pair into every halfword lane, viewed as bytes. */
5427 filt0 = (v16i8) __msa_fill_h(filt_const0);
5428 filt1 = (v16i8) __msa_fill_h(filt_const1);
5429 filt2 = (v16i8) __msa_fill_h(filt_const2);
/*
 * ILVR_B4_SB / ILVL_B4_SB are defined outside this listing; by the output
 * names they pair adjacent source rows into *_r and *_l vectors -- TODO
 * confirm. The trailing arguments of all four calls are on lines missing
 * from this listing.
 */
5439 ILVR_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_r, src21_r,
5441 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5443 ILVL_B4_SB(
src1,
src0, src2,
src1, src3, src2, src4, src3, src10_l, src21_l,
5445 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/*
 * AVC_DOT_SH3_SH: signed byte dot product with filt0 followed by two
 * accumulating dot products with filt1 and filt2 (halfword results).
 * vt_res0/vt_res1 use the 10/32/54 pairs, vt_res2/vt_res3 the 21/43/65 pairs.
 */
5447 vt_res0 =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5448 vt_res1 =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5449 vt_res2 =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5450 vt_res3 =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
/* Shuffle each (right, left) result pair with mask0..mask2; VSHF_H3_SH is
 * defined outside this listing. */
5451 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5452 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5453 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5454 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/*
 * Seed each word accumulator with adjacent-halfword sums of the mask0 shuffle.
 * DPADD_SH2_SW is not shown; presumably it then adds the mask1 shuffle times
 * -5 and the mask2 shuffle times 20 -- TODO confirm.
 */
5455 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5456 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5457 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5458 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Same sequence for the next two rows (32/54/76 and 43/65/87 pairs). */
5460 vt_res0 =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5461 vt_res1 =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5462 vt_res2 =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5463 vt_res3 =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* shf_vec0/1/3/4 are reused; the mask2 shuffles go to shf_vec6 and shf_vec7
 * so that shf_vec2 and shf_vec5 from the first pass stay available below. */
5464 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5465 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5466 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5467 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5468 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5469 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5470 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5471 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* Rounding arithmetic right shift by 5 of the four mask2 shuffles. */
5478 dst0 = __msa_srari_h(shf_vec2, 5);
5479 dst1 = __msa_srari_h(shf_vec5, 5);
5480 dst2 = __msa_srari_h(shf_vec6, 5);
5481 dst3 = __msa_srari_h(shf_vec7, 5);
/* Interleave the odd-indexed halfwords of dstN with zeros, so each kept
 * halfword occupies its own word lane for the word average below. */
5486 dst0 = __msa_ilvod_h(zeros, dst0);
5487 dst1 = __msa_ilvod_h(zeros, dst1);
5488 dst2 = __msa_ilvod_h(zeros, dst2);
5489 dst3 = __msa_ilvod_h(zeros, dst3);
/* Signed word average of each accumulator with the widened dstN. */
5491 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5492 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5493 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5494 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* Narrow the four word vectors into dst0 and dst2 (PCKEV_H2_SH not shown). */
5498 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
/* Unsigned byte average of out with dstv; the statements that fill both
 * operands are not shown. */
5500 out = __msa_aver_u_b(
out, dstv);
/*
 * Fragment of a function body: the signature, braces, loads, the horizontal
 * pass that fills hz_out0..8, the dot-product statements that fill tmp0/tmp1,
 * and the stores are all missing from this listing.
 */
/*
 * Each 32-bit constant holds two signed halfword coefficients, low half
 * first: 0xfffb0001 -> (1, -5), 0x140014 -> (20, 20), 0x1fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 is above INT32_MAX, so that initializer relies on
 * the implementation-defined conversion keeping the bit pattern.
 */
5507 const int32_t filt_const0 = 0xfffb0001;
5508 const int32_t filt_const1 = 0x140014;
5509 const int32_t filt_const2 = 0x1fffb;
/* Start reading two rows above and two bytes to the left of src. */
5510 const uint8_t *src_tmp =
src - (2 *
stride) - 2;
5511 uint8_t *dst_tmp = dst;
5512 uint64_t tp0, tp1, tp2, tp3;
5513 uint32_t multiple8_cnt, loop_cnt;
5514 v16u8 dst0, dst1, out0, out1;
5515 v16i8
src0,
src1, src2, src3, src4, mask0, mask1, mask2;
5516 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5517 v8i16 hz_out7, hz_out8, res0, res1, res2, res3;
5518 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5519 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
5520 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
5521 v8i16 hz_out87_l, filt0, filt1, filt2;
/* Replicate each coefficient pair into every word lane, viewed as halfwords. */
5524 filt0 = (v8i16) __msa_fill_w(filt_const0);
5525 filt1 = (v8i16) __msa_fill_w(filt_const1);
5526 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Outer loop: 2 iterations (multiple8_cnt is 1 then 0 inside the body). */
5530 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* Inner loop: 4 iterations (loop_cnt is 3 down to 0 inside the body). */
5544 for (loop_cnt = 4; loop_cnt--;) {
/*
 * ILVR_H4_SH / ILVL_H4_SH are defined outside this listing; by the output
 * names they pair consecutive hz_out rows into *_r and *_l vectors -- TODO
 * confirm. The last output argument of each call is on a line missing here.
 */
5553 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5554 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
5556 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5557 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
5559 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5560 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
5562 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5563 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* Each resN keeps the even-indexed halfwords of tmp0 and tmp1; tmp0/tmp1 are
 * declared and assigned on lines not shown, and are reassigned between the
 * four packs. */
5570 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5575 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5580 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5585 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/*
 * Fragment of a function body: the signature, braces, loads, the horizontal
 * pass that fills hz_out0..12, the tmp0/tmp1 declarations and the stores are
 * missing from this listing. Every AVC_DOT_SW3_SW call below is cut off after
 * its fifth argument (the remainder is on a line not shown).
 */
/*
 * Each 32-bit constant holds two signed halfword coefficients, low half
 * first: 0xfffb0001 -> (1, -5), 0x140014 -> (20, 20), 0x1fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 is above INT32_MAX, so that initializer relies on
 * the implementation-defined conversion keeping the bit pattern.
 */
5611 const int32_t filt_const0 = 0xfffb0001;
5612 const int32_t filt_const1 = 0x140014;
5613 const int32_t filt_const2 = 0x1fffb;
5614 uint64_t tp0, tp1, tp2, tp3;
5615 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
5616 v16i8
src0,
src1, src2, src3, src4, mask0, mask1, mask2;
5617 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5618 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
5619 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5620 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
5621 v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3;
5622 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
5623 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
5624 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
/* Replicate each coefficient pair into every word lane, viewed as halfwords. */
5627 filt0 = (v8i16) __msa_fill_w(filt_const0);
5628 filt1 = (v8i16) __msa_fill_w(filt_const1);
5629 filt2 = (v8i16) __msa_fill_w(filt_const2);
/*
 * ILVR_H4_SH / ILVL_H4_SH are defined outside this listing; by the output
 * names they pair consecutive hz_out rows into *_r and *_l vectors -- TODO
 * confirm against the macro definitions.
 */
5651 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5652 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5653 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5654 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
5655 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5656 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
5657 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5658 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/*
 * AVC_DOT_SW3_SW: signed halfword dot product with the first coefficient
 * vector, two accumulating dot products with the next two, then a rounding
 * right shift by 10 and a signed saturation with parameter 7. Each resN packs
 * the even-indexed halfwords of the right-half (tmp0) and left-half (tmp1)
 * results.
 */
5660 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5662 tmp1 =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
5664 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5665 tmp0 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5667 tmp1 =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
5669 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5670 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5672 tmp1 =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
5674 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5675 tmp0 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5677 tmp1 =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
5679 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/*
 * Pair rows 8..12 for the second group of outputs. NOTE(review): the outputs
 * built from (hz_out9, hz_out8) and (hz_out10, hz_out9) are named *89* and
 * *910*, unlike the high-row-first naming (e.g. 1110) used for other pairs.
 * The last output argument of each call is on a line missing here.
 */
5695 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5696 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
5698 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5699 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
/* Same dot-product / pack pattern as above; res0..res3 are overwritten. */
5701 tmp0 =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
5703 tmp1 =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
5705 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5706 tmp0 =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
5708 tmp1 =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
5710 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5711 tmp0 =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
5713 tmp1 =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
5715 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5716 tmp0 =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
5718 tmp1 =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
5720 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5733 const int32_t filt_const0 = 0xfffb0001;
5734 const int32_t filt_const1 = 0x140014;
5735 const int32_t filt_const2 = 0x1fffb;
5736 uint32_t tp0, tp1, tp2, tp3;
5737 v16u8 res, dst0 = { 0 };
5738 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
5739 v16i8 mask0, mask1, mask2;
5740 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5741 v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2;
5742 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5743 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
5748 filt0 = (v8i16) __msa_fill_w(filt_const0);
5749 filt1 = (v8i16) __msa_fill_w(filt_const1);
5750 filt2 = (v8i16) __msa_fill_w(filt_const2);
5765 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
5766 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
5767 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5768 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5769 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5770 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
5772 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5774 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5776 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5777 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5779 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5781 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5785 res = __msa_aver_u_b(res, dst0);